Exemplo n.º 1
0
 def move(self, tokenizedfile):
     '''
     Moves a source file and its token file out of the input location.

     The source file is handed to utils.move with config.backup_area as target,
     the token file (same name plus '.tok') with config.processing_area as target.

     :param tokenizedfile:   Name of the source file inside self.input_loc.
     :return:                No return value, sets self.status to 0 after both moves.
     '''
     token_name = tokenizedfile + '.tok'

     # The source file goes to the backup area.
     source_origin = os.path.join(self.input_loc, tokenizedfile)
     source_target = os.path.join(config.backup_area, tokenizedfile)
     utils.move(source_origin, source_target)

     # The token file goes to the processing area.
     token_origin = os.path.join(self.input_loc, token_name)
     token_target = os.path.join(config.processing_area, token_name)
     utils.move(token_origin, token_target)

     timestamp = unicode(datetime.datetime.now())
     print('%s - Moved files: %s and %s.' % (timestamp, tokenizedfile, token_name))
     self.status = 0
Exemplo n.º 2
0
    def tag(self, tokenfile):
        '''
        Tags a tokenized text file and writes the detected entities to a '.tag' file.

        Reads all lines of the token file from self.processing_loc, passes them to
        self.tagger.tag and writes one line per entity ("<tag> <position> <token>")
        to '<tokenfile>.tag'. The result file is then moved from self.processing_loc
        to self.output_loc and self.status is set to 1.

        :param tokenfile:   Name of a tokenized file.
        :return:            No return value, instead creates a file that contains all detected entities.
        '''
        # Tokens found inside this string (substring test) are treated as punctuation.
        punctuation = '!?&"#,;:.-\\/()[]{}='
        quotation_marks = ('``', '\'\'')

        result_name = '%s.tag' % tokenfile
        result_path = os.path.join(self.processing_loc, result_name)

        # Context managers guarantee both files are closed, even if tagging or writing fails.
        with open(os.path.join(self.processing_loc, tokenfile), 'r') as token_source:
            token_list = token_source.readlines()

        tagged_list = self.tagger.tag(token_list)

        with codecs.open(result_path, mode='w+', encoding='utf-8') as tag_resultfile:
            # NOTE(review): the counter is incremented before a token is written, so the
            # first word is reported at position 2 - confirm this is what consumers expect.
            position = 1

            for tagged_token in tagged_list:
                token = tagged_token[0]

                # Increase position counter only for words, not for punctuation marks and quotations.
                if token in punctuation or token in quotation_marks:
                    continue
                position += 1

                # Write token + tag + position to file, if token is not tagged as 'O' (other).
                if tagged_token[1] != 'O':
                    tag_resultfile.write(tagged_token[1] + ' ' + unicode(position) + ' ' + token + '\n')

        print('%s - Successfully tagged the file: %s' % (unicode(datetime.datetime.now()), tokenfile))

        utils.move(result_path, os.path.join(self.output_loc, result_name))

        self.status = 1