Exemplo n.º 1
0
 def __init__(self, text=None, filename=None):
   """Initialize the document from raw text or from the contents of a file.

   text     -- document contents as a string; used whenever it is not None
   filename -- path of a file to read the contents from; used only when
               text is None

   The chosen contents are passed through self.strip and stored in
   self.text.  If the file cannot be opened, a message is printed and
   self.text is set to None so callers can test it for truthiness.

   Raises TypeError if both text and filename are None.
   """
   if text is not None:
     self.text = self.strip(text)
     return

   if filename is None:
     # The old code reached this point and crashed with an accidental
     # TypeError while concatenating None into the "File not found" message;
     # raise the same exception type deliberately, with a usable message.
     raise TypeError("Document requires either text or filename")

   f = Util.open_file(filename)
   # assumes Util.open_file returns a falsy value when the file cannot be
   # opened -- TODO confirm against Util
   if not f:
     # single-argument parenthesized print behaves the same as a statement
     # (Python 2) and as a function call (Python 3)
     print("File not found: " + filename)
     self.text = None
     return

   try:
     self.text = self.strip(f.read())
   finally:
     # always release the handle, even if read() or strip() raises
     f.close()
Exemplo n.º 2
0
  def learn(self, dir, easy):
    """Learn from a directory of labeled documents, printing running accuracy.

    dir  -- name of the directory containing the labeled documents; files
            are expected to be named "0.txt", "1.txt", ... up to one less
            than the number of regular files found in the directory
    easy -- if True, each document is added under the label read from the
            file; if False, it is added under the analyzer's own guess

    In each labeled document the first line is the heuristic (label) and
    the second line is the text to analyze.  Files that cannot be opened
    are skipped and do not count towards the printed totals.
    """
    # number of regular files in the directory; this bounds the indices tried
    num_files = sum(os.path.isfile(path) for path in glob.glob(dir + "/*"))
    # number of correct guesses so far
    total_correct = 0
    # number of files successfully opened and processed so far
    total_files = 0

    for i in range(num_files):
      fn = dir + "/" + str(i) + ".txt"
      f = Util.open_file(fn)
      # a falsy handle means the file could not be opened; skip it
      if not f:
        continue

      try:
        # strip only the line terminator: slicing off the last character
        # would eat a real character when the label line has no trailing
        # newline, and would leave a stray "\r" on CRLF files
        correct_heuristic = f.readline().rstrip("\r\n")
        text = f.readline()
      finally:
        # close before Document re-opens the same path below
        f.close()

      guessed_heuristic = self.analyze(None, text)

      # token shown to the user: did the guess match the label?
      if correct_heuristic == guessed_heuristic:
        was_correct = "Y"
        total_correct += 1
      else:
        was_correct = "N"
      total_files += 1

      # NOTE(review): the Document is built from the file itself rather than
      # from `text`; if Document reads the whole file, the label line becomes
      # part of the learned text -- confirm this is intended.
      label = correct_heuristic if easy else guessed_heuristic
      self.add(Document(None, fn), label, True)

      # single-argument parenthesized print works under Python 2 and 3
      print("File " + str(i) + ": " + was_correct + ". " + str(total_correct) + "/" + str(total_files))
Exemplo n.º 3
0
  def __init__(self, seed=None):
    """Build the analyzer from a seed file of heuristics and document counts.

    seed -- name of the seed file.  The file is read as consecutive pairs
            of lines: a heuristic name (passed through Util.strip) followed
            by the number of documents for that heuristic.  Documents are
            loaded from "<heuristic>/<i>.txt" for i in range(count); only
            documents whose text is truthy are kept.

    On success self.dict maps each heuristic to its list of documents,
    self.word_counts and self.log_values are created, and consolidate /
    transform are run for every heuristic.

    If seed is None, the seed cannot be opened, or a count line is not an
    integer, a message is printed and the constructor returns early, leaving
    the instance only partially initialized.  A heuristic line with no count
    line after it terminates the program via sys.exit().
    """
    if seed is None:
      print("Initializing an Analyzer object requires a seed.  Please try again.")
      return

    # ---USER FEEDBACK---
    print("\nAttempting to read seed file...")
    f = Util.open_file(seed)
    # a falsy handle means the seed could not be opened
    if not f:
      print("Initialization failed; please verify that the seed exists then try again.")
      return

    # heuristic name -> list of documents loaded for it.  Stored on the
    # instance up front so an empty seed file still leaves self.dict defined
    # (it used to be assigned only inside the loop, which made the loop over
    # self.dict below fail for an empty seed).
    docs_by_heuristic = {}
    self.dict = docs_by_heuristic

    try:
      while True:
        # heuristic line; an empty read means end of file
        line = f.readline()
        if not line:
          break
        current_heuristic = Util.strip(line)
        # ---USER FEEDBACK---
        print("Reading files for heuristic \'" + current_heuristic + "\'...")

        # count line; its absence means the seed is structurally broken
        next_line = f.readline()
        if not next_line:
          print("Incorrect seed structure.  Exiting")
          sys.exit()

        try:
          num_files = int(next_line)
        except ValueError:
          print("Seed file is of incorrect format.  Please try again.")
          return

        # load every document for this heuristic, keeping only those whose
        # text was successfully initialized
        docs = []
        for i in range(num_files):
          new_doc = Document(None, current_heuristic + "/" + str(i) + ".txt")
          if new_doc.text:
            docs.append(new_doc)
        docs_by_heuristic[current_heuristic] = docs
    finally:
      # release the seed file on every exit path (normal, return, sys.exit)
      f.close()

    # ---USER FEEDBACK---
    print("Done reading files!\n")

    # derived values, filled in per heuristic by consolidate/transform
    self.word_counts = {}
    self.log_values = {}
    for key in self.dict:
      self.consolidate(key)
      self.transform(key, True)

    # ---USER FEEDBACK---
    print("Analyzer object initialized!\n")