def tfRecordToCaffe(self,
                    datasetName,
                    outputPath,
                    nameAsGroundTruth=False):
    # Create the Caffe-style dataset layout and an annotation manager.
    MkDataSetStructure(os.path.join(outputPath, datasetName))
    fileManager = Tagger(os.path.join(outputPath, datasetName))

    dataBuffer = self.readTFRecord()
    for data in dataBuffer:
        if nameAsGroundTruth:
            imageName = data["filename"]
        else:
            imageName = data["sourceID"]

        imageFilename = imageName + ".jpg"
        imageOutputPath = os.path.join(outputPath, datasetName, "Images")

        fileManager.AppendTrainingImg(imageName)
        self.saveFromRawImageData(data["imgEncoded"], data["height"],
                                  data["width"], imageOutputPath,
                                  imageFilename)

        # One bounding box and "<class name> <class id>" label per annotation.
        for xMin, yMin, xMax, yMax, classText, classID in zip(
                data["xMins"], data["yMins"], data["xMaxs"], data["yMaxs"],
                data["classesText"], data["classesID"]):
            fileManager.AppendAnnotation(
                (xMin, yMin), (xMax, yMax), imageName,
                classText.decode("utf-8") + " " + str(classID))
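# A minimal usage sketch, assuming this method belongs to a converter class
# that implements readTFRecord() and saveFromRawImageData() (the class name
# and paths below are illustrative, not from the source):
#
#     converter = TFRecordConverter("train.tfrecord")
#     converter.tfRecordToCaffe("my_dataset", "/data/caffe_sets",
#                               nameAsGroundTruth=True)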
Example #2
    def path_input(self, pth):
        """Handle a request for a new input path: tag the file(s) if they are
        new, then display them."""

        self.clear_results()
        tfiles = None

        # Single image path
        if not isdir(pth):
            new = True
            # The database raises IntegrityError if the path is not unique
            try:
                DataBase.add_image(pth)
            except IntegrityError:
                new = False

            # Only tag images that were not already in the database
            if new:
                tags = Tagger.tag_file(pth)
                for tag in tags:
                    DataBase.tag_image(tag, pth=pth)
        # Directory path
        else:
            tags, tfiles = Tagger.tag_dir(pth)
            for i in range(len(tfiles)):
                # Full path to image
                fpth = join(pth, tfiles[i])
                tfiles[i] = fpth
                # Skip images that are already present
                if DataBase.exists(pth=fpth):
                    continue
                DataBase.add_image(fpth)
                # tag_dir returns a tuple of tags or a single string per file
                if not isinstance(tags[i], str):
                    for t in tags[i]:
                        DataBase.tag_image(t, pth=fpth)
                else:
                    DataBase.tag_image(tags[i], pth=fpth)

        L = 1
        # Display
        if tfiles is None:
            self.queue_images(pth)
        else:
            # Size the results listbox to the number of files, clamped to [3, 12]
            L = len(tfiles)
            self.ui.builder.get_object("ListResults").config(
                height=min(max(3, L), 12))

            self.queue_images(tfiles)

        self.update_info("Processed " + str(L) + " images")
Example #3
def cross_validate(count_words):
    ''' Runs cross-validation on the Tagger. If count_words is True, the Tagger
        counts the words of the test files as well, so P(word | tag) is fully
        known, while P(tag | prev_tag) is still estimated from the 90% training
        split only '''

    global total_errs, total_matches
    sum_err = 0
    print 'Fold    Err    Match    Frac_Match'

    # files in the corpus range from 0-100, so test and train ranges are slices of this
    # range, such that train_range and test_range together make range(0,100)
    for i in xrange(folds):
        train_range = range(0, chunk * i) + range(chunk * (i + 1), 100)
        test_range = range(chunk * i, chunk * (i + 1))
        total_errs = 0
        total_matches = 0

        tm = time.time()
        c = TagCounter()
        c.parse_corpus_range(train_range)
        if count_words:
            c.only_words = count_words
            c.parse_corpus_range(test_range)
        t = Tagger(c)
        tm = time.time() - tm
        if timing:
            print tm,

        # file_validate is mapped across all files in test_range
        def file_validate(f):
            global total_errs, total_matches
            sentences = parse_file(f)
            for sent in sentences[:size]:
                words = []
                for word in sent[1:]:  # sent[0] will always be START
                    words.append(word.true_chars)
                tagged = t.tag_words(words)
                matches = 0
                errs = 0
                for (actual_w, pred_w) in zip(sent, tagged):
                    if actual_w.tag != pred_w.tag:
                        #print actual_w, pred_w     # prints the mistagged pairs
                        errs += 1
                    else:
                        matches += 1
                total_errs += errs
                total_matches += matches

        tm = time.time()
        map_files(file_validate, test_range)
        tm = time.time() - tm
        if timing:
            print tm
        print '%3d  %6d  %7d      %0.4f' % (i, total_errs, total_matches,
                                            percent_match(
                                                total_errs, total_matches))
        print ''

        sum_err += percent_match(total_errs, total_matches)

    print 'cumulative averaged error:', (sum_err * 1.0) / folds
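# A hedged usage sketch (assumes the module-level `folds`, `chunk`, `size`,
# and `timing` settings referenced above are configured):
#
#     cross_validate(False)  # emission probabilities from training data only
#     cross_validate(True)   # emission counts also taken from the test files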
Example #4
def extract_entities(pmid):
    """
    This function tags genes and species in the PubMed article (based on the id provided).
    If possible these genes and species are further annotated using information present
    in the NCBi gene/taxonomy database. 
    :param pmid: The id of the article that should be tagged.
    :return: The genes (as Gene objects) and the organisms (as Organism object) found in the article. 
    """
    tagger = Tagger()
    tag_object = tagger.tag([pmid])
    genes = []
    organisms = []
    if tag_object:
        tag_object = tag_object[0]
        annotation = tag_object.get_annotation()
        genes = annotation.get("Gene", {None})
        organisms = annotation.get("Species", {None})
        genes = [convert_to_object(gene, "Gene") for gene in genes]
        organisms = [
            convert_to_object(organism, "Species") for organism in organisms
        ]
    return genes, organisms
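# A hedged usage sketch (the PMID below is illustrative; whether it is passed
# as a string or an int depends on the Tagger implementation):
#
#     genes, organisms = extract_entities("12345678")
#     for gene in genes:
#         print(gene)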
Example #5
class TaggerHandler:
  def __init__(self, dataDir, table):
    sys.stderr.write("TaggerHandler: Constructor\n")
    self.__TaggerInstance = Tagger()
    self.__dataDir = dataDir
    self.__tableFile = open(table,'w')
    
  def __updateTable(self, setting, accuracies):
    self.__tableFile.write(setting + '\t' + '\t'.join(map(str, accuracies)) + '\n')
  
  def __runTagger(self, trainFile, testFile):
    self.__TaggerInstance.loadData(trainFile, testFile)
    self.__TaggerInstance.train()
    accuracies = self.__TaggerInstance.test()
    setting = trainFile.split("_")[0]
    self.__updateTable(setting, accuracies)
  
  def run(self, trainFiles, testFiles):
    trainFiles = [self.__dataDir+line.strip() for line in open(trainFiles)]
    testFiles = [self.__dataDir+line.strip() for line in open(testFiles)]
    for trainFile in trainFiles:
      for testFile in testFiles:
        self.__runTagger(trainFile, testFile)
    self.__tableFile.close()
Example #6
    def save_results(self):
        abs_filepath = os.path.abspath(self.outfile)

        total = Utils.get_count_of_sentences(self.testfile) * 1.0
        logging.info("{} sentences found".format(total))

        with open(abs_filepath, 'w') as f:
            for i, s in enumerate(Utils.get_sentence(self.testfile)):
                original_sentence = s['o']  # original raw sentence from file
                s = s['c']  # cleaned sentence
                untagged_sentence = self.remove_tags(s)
                untagged_original_sentence = self.remove_tags(
                    original_sentence)
                tags = Tagger(self.model, untagged_sentence).tag()
                tagged_sentence = self.attach_tags(untagged_original_sentence,
                                                   tags)
                Utils.write_sentence(f, tagged_sentence)

                if i % 100 == 0:
                    logging.info("{}% done. Last tagged: {}".format(
                        round(i / total * 100.0, 2), s.replace("\n", " ")))

            f.write(Utils.SENTENCE_SEPARATOR)
Example #7
# -*- coding: cp1254 -*-

# Onur Yilmaz

# Imports
from Tagger import Tagger

# Load the previously saved tagger
taggerFileName = 'my_tagger.yaml'
myTagger = Tagger.load(taggerFileName)


# Keep the original functionality intact
def tag(sentence):
    return myTagger.tag(sentence)


# End of code
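# A hedged usage sketch (the expected input format depends on how
# my_tagger.yaml was trained; a pre-tokenized sentence is a reasonable guess
# for an NLTK-style tagger):
#
#     print(tag(["This", "is", "a", "test"]))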
Example #9
# Setting up directory
mypath = os.getcwd() + '/data/seminar_testdata/test_untagged/'
trainingPath = 'data/training'
directory = os.fsencode(mypath)

# Runs the ontology classification
print('Running Ontology Classification: \n')
ontology = Ontology()
ontology.run(mypath)

# Begins tagging
print("\nTagging progress beginning. Get a brew, it'll take a while... \n")
extractor = DataExtractor()

# Trains our model
extractor.train(trainingPath)
tagger = Tagger()

# Tags all emails in the directory given
tagger.tag_seminar(mypath, directory, extractor)

# Calculates how long the program took (start_time is captured earlier in the script)
seconds = time.time() - start_time
m, s = divmod(seconds, 60)
print("The program has been running for {0} minutes and {1} seconds \n".format(
    round(m), round(s)))

# Evaluates results
evaluation = Evaluation()
evaluation.run()
Example #10
    def test_load_incorrectly(self):
        temporaryFileName = os.path.join(self.tempDir, 'temporary_file.txt')
        with open(temporaryFileName, 'w') as file:
            file.write("This is a line that won't be able to be read")
        with self.assertRaises(TypeError):
            Tagger.load(temporaryFileName)
Example #11
    def setUp(self):
        self.filePath = 'my_tagger.yaml'
        self.tag = Tagger.load(self.filePath)
Example #12
class TaggerHandler:
  def __init__(self, dataDir, table):
    sys.stderr.write("TaggerHandler: Constructor\n")
    self.__TaggerInstance = Tagger()
    self.__dataDir = dataDir
    self.__tableFile = open(table, 'w', 1)
    self.__tableFile.write('TrainCSType\tTrainPureCSSplit\tTrainSize\tExperimentType\tTestCSType\tTestPureCSSplit\tTestSize\tTagset\tOverallAccuracy\tSameContextAccuracy\tDifferentContextAccuracy\tPrevWordDifferentAccuracy\tPrePrevWordDifferentAccuracy\tUnknowns\n')
    
  def __updateTable(self, setting, accuracies):
    self.__tableFile.write(setting + '\t' + '\t'.join(map(str, accuracies)) + '\n')
  
  def __getSetting(self, string):
    # Parse experiment settings out of a file name shaped like
    # "...TrainCS<cstype>CS<csSplit>Pure<pureSplit>Total<size>_..."
    string = string.split("/")[-1].split("TrainCS")[1]
    cstype = string.split("CS")[0]
    csSplit = string.split("CS")[1].split("Pure")[0]
    pureSplit = string.split("Pure")[1].split("Total")[0]
    pureCSSplit = pureSplit + '-' + csSplit 
    totalSize = string.split("Total")[1].split("_")[0]
    return '\t'.join([cstype, pureCSSplit, totalSize])
  
  def __runTagger(self, trainFile, testFile):
    self.__TaggerInstance.loadData(trainFile, testFile)
    self.__TaggerInstance.train()
    accuracies = self.__TaggerInstance.test()
    trainSetting = self.__getSetting(trainFile)
    testSetting = self.__getSetting(testFile)
    tagset = self.__tagset(trainFile)
    self.__updateTable(trainSetting + '\t' + testSetting + '\t' + tagset, accuracies)
    
  def __runTagger2(self, trainFile, testFile, expType):
    trainFile = self.__dataDir+trainFile
    testFile = self.__dataDir+testFile
    self.__TaggerInstance.loadData(trainFile, testFile)
    self.__TaggerInstance.train()
    accuracies = self.__TaggerInstance.test()
    trainSetting = self.__getSetting(trainFile)
    testSetting = self.__getSetting(testFile)
    tagset = self.__tagset(trainFile)
    self.__updateTable(trainSetting + '\t' + expType + '\t' + testSetting + '\t' + tagset, accuracies)
    
  def __tagset(self, string):
    tagset = "Mixed"
    if len(string.split(".")) > 1 and string.split(".")[1] == "uni":
      tagset = 'Universal'
    if string.find(".uniq") >= 0:
      tagset += ".uniq"
    return tagset
  
  def run(self, trainFiles, testFiles):
    trainFiles = [line.strip() for line in open(trainFiles)]
    testFiles = [line.strip() for line in open(testFiles)]
    for trainFile in trainFiles:
      for testFile in testFiles:
        # Only evaluate when both the train and test files use the Mixed tagset
        if self.__tagset(trainFile) != "Mixed" or self.__tagset(trainFile) != self.__tagset(testFile):
          continue
        self.__runTagger(trainFile, testFile)
    self.__tableFile.close()
    
  def run2(self, trainFiles, testFiles):
    trainFiles = [line.strip() for line in open(trainFiles)]
    testFiles = [line.strip() for line in open(testFiles)]
    for trainFile in trainFiles:
      for testFile in testFiles:
        if self.__tagset(trainFile) != self.__tagset(testFile):
          continue
        controlTrainFile = trainFile + "_Control"
        if self.__tagset(trainFile) == "Universal":
          controlTrainFile = trainFile.split(".uni")[0] + "_Control" + ".uni"
        self.__runTagger2(trainFile, testFile, "Experiment")
        self.__runTagger2(controlTrainFile, testFile, "Control")
    self.__tableFile.close()
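# A hedged usage sketch (file names are illustrative): run() and run2() take
# text files that list the train/test corpus paths, one per line.
#
#     handler = TaggerHandler("data/", "results_table.tsv")
#     handler.run("train_files.txt", "test_files.txt")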
Example #13
    print "("+token+")",
print "\n\nNUMBER OF TOKENS ANSWER,TEST"
print len(gold_tokens),len(test_tokens)
print "\n\nDIFFERENCE\nONLY IN TEST"
difference = set(test_tokens)-set(gold_tokens)
for token in difference:
    print token+"  ",
print "\n\nONLY IN ANSWER"
difference = set(gold_tokens)-set(test_tokens)
for token in difference:
    print token+"  ",
print "\n"


print "--TAGGER--"
tagger = Tagger(tagged_train_sents)
print "EVALUATE TAGGER"
print "RATE"
tagger.evaluate(tagged_gold_sents)
print "\n\nANSWER"
tagged_gold_tokens = sum(tagged_gold_sents, [])
for tup in tagged_gold_tokens:
    print str(tup[0])+"/"+str(tup[1]),
print "\n\nTEST"
gold_tokens = []
for sent in tagged_gold_sents:
    for tup in sent:
        gold_tokens.append(tup[0])
tagged_test_tokens = tagger.tag(gold_tokens)
for tup in tagged_test_tokens:
    print str(tup[0])+"/"+str(tup[1]),
Example #14
    return float(orphan_count) / len(tag_occurrences)


def calculate_combined(tagging_user):
    """
    Returns the combined measure: the mean of the user's orphaniness and
    normalized conditional entropy.
    """
    return (get_orphaniness(tagging_user) + get_cond_entropy_normalized(tagging_user)) / 2


if __name__ == '__main__':
    TAGGER = Tagger("hugo")
    TAG_SET = set()
    TAG_SET.add("computer")
    TAG_SET.add("reference")
    
    TAGGER.add_post("1", TAG_SET)
    TAG_SET.clear()
    
    TAG_SET.add("reference")
    TAG_SET.add("calculator")
    TAG_SET.add("rate")
    
    TAGGER.add_post("2", TAG_SET)
    
    TAG_SET.clear()
    
Example #15
######### hmm-tagger.py #########

from Tagger import Tagger # import the tagging controller
import os # for path info


# initialize a tagging object with the cleaned corpus file(s)
t = Tagger(os.getcwd() + '/', ['text_1.txt', 'text_2.txt', 'text_3.txt'], ['text_5.txt'])

# perform ten-fold cross-validation
t.run_test_cycles()
Example #18
    def test_load_nonexisting(self):
        with self.assertRaises(FileNotFoundError):
            Tagger.load("this_file_definitely_doesnt_exist.txt")
Example #19
    def test_load_file(self):
        tagger = Tagger.load(self.filePath)
        self.assertIsInstance(tagger.myTagger, BrillTagger)
Example #21
######### hmm-tagger.py #########

from TreebankCleaner import TreebankCleaner  # import cleaning class
from Tagger import Tagger  # import the tagging controller
import os  # for path info
import sys  # for command line options

if '--clean' in sys.argv:
    # initialize treebank cleaner with the current path and pre-downloaded file(s)
    t = TreebankCleaner(os.getcwd() + '/', ['treebank3_sect2.txt'])
    # do cleaning
    t.clean()

# initialize a tagging object with the cleaned corpus file(s)
t = Tagger(os.getcwd() + '/', ['treebank3_sect2.txt_cleaned'])

# perform ten-fold cross-validation
t.run_test_cycles()