def move(self, tokenizedfile):
    '''
    Moves a source file and its companion token file out of the input location.

    The source file goes to ``config.backup_area``; the token file (same name
    with ``.tok`` appended) goes to ``config.processing_area``. Afterwards a
    timestamped message is printed and ``self.status`` is set to 0.

    :param tokenizedfile: Name of the source file inside ``self.input_loc``.
    :return: No return value.
    '''
    token_name = ''.join((tokenizedfile, '.tok'))

    # Source file first: input location -> backup area.
    backup_from = os.path.join(self.input_loc, tokenizedfile)
    backup_to = os.path.join(config.backup_area, tokenizedfile)
    utils.move(backup_from, backup_to)

    # Then the token file: input location -> processing area.
    processing_from = os.path.join(self.input_loc, token_name)
    processing_to = os.path.join(config.processing_area, token_name)
    utils.move(processing_from, processing_to)

    timestamp = unicode(datetime.datetime.now())
    print('%s - Moved files: %s and %s.' % (timestamp, tokenizedfile, token_name))
    self.status = 0
def tag(self, tokenfile):
    '''
    Tags text files.

    Reads ``tokenfile`` from ``self.processing_loc``, passes its lines to
    ``self.tagger.tag`` and writes one ``<tag> <position> <token>`` line for
    every token whose tag is not ``'O'`` to ``<tokenfile>.tag``. The result
    file is created in ``self.processing_loc`` and then moved to
    ``self.output_loc``. Finally ``self.status`` is set to 1.

    :param tokenfile: Name of a tokenized file.
    :return: No return value, instead creates a file that contains all detected entities.
    '''
    # Tokens that do not advance the word position. ``in`` on a string is a
    # substring test, so the empty token and any token that is a contiguous
    # run of this string (e.g. ',;') are treated as punctuation as well.
    punctuation = '!?&"#,;:.-\\/()[]{}='
    quotation_marks = ('``', '\'\'')

    result_name = '%s.tag' % tokenfile
    result_path = os.path.join(self.processing_loc, result_name)

    # ``with`` closes the input handle even if reading fails.
    with open(os.path.join(self.processing_loc, tokenfile), 'r') as open_tokenfile:
        token_list = open_tokenfile.readlines()

    tagged_list = self.tagger.tag(token_list)

    # ``with`` closes (and flushes) the result file even if writing fails,
    # so it is never moved while still open.
    with codecs.open(result_path, mode='w+', encoding='utf-8') as tag_resultfile:
        # NOTE(review): the counter starts at 1 and is incremented before the
        # write, so the first word is reported at position 2 - confirm that
        # consumers of the .tag file expect this numbering.
        position = 1
        for tagged_token in tagged_list:
            token = tagged_token[0]
            # Increase position counter only for words, not for punctuation
            # marks and quotations.
            if token not in punctuation and token not in quotation_marks:
                position += 1
            # Write tag + position + token to file, if token is not tagged
            # as 'O' (other).
            label = tagged_token[1]
            if label != 'O':
                tag_resultfile.write(label + ' ' + unicode(position) + ' ' + token + '\n')

    print('%s - Successfully tagged the file: %s' % (unicode(datetime.datetime.now()), tokenfile))
    utils.move(result_path, os.path.join(self.output_loc, result_name))
    self.status = 1