def run_txt2seg(rconfig, limit, options, verbose):
    """Run the Chinese segmenter over the txt files of the input data set,
    writing one seg file per input file to the output data set."""
    input_dataset = find_input_dataset(TXT2SEG, rconfig)
    output_dataset = find_output_dataset(TXT2SEG, rconfig)
    print_datasets(TXT2SEG, input_dataset, output_dataset)
    check_file_counts(input_dataset, output_dataset, limit)
    segmenter = sdp.Segmenter()
    wrapper = cn_txt2seg.SegmenterWrapper(segmenter)
    specs = get_lines(rconfig.filenames, output_dataset.files_processed, limit)
    count = 0
    for count, spec in enumerate(specs, 1):
        target = spec.target
        print_file_progress(TXT2SEG, rconfig.corpus, count, target, verbose)
        infile, outfile = prepare_io(target, input_dataset, output_dataset)
        uncompress(infile)
        wrapper.process(infile, outfile)
        compress(infile, outfile)
        # checkpoint the number of processed files every STEP files
        if count % STEP == 0:
            output_dataset.update_processed_count(STEP)
    # the remainder (count % STEP) has not been checkpointed yet
    return (count % STEP, [output_dataset])
def run_txt2tag(rconfig, limit, options, verbose):
    """Run the part-of-speech tagger over the txt files of the input data
    set, writing one tag file per input file to the output data set."""
    input_dataset = find_input_dataset(TXT2TAG, rconfig)
    output_dataset = find_output_dataset(TXT2TAG, rconfig)
    print_datasets(TXT2TAG, input_dataset, output_dataset)
    check_file_counts(input_dataset, output_dataset, limit)
    tagger = txt2tag.get_tagger(rconfig.language)
    specs = get_lines(rconfig.filenames, output_dataset.files_processed, limit)
    count = 0
    for count, spec in enumerate(specs, 1):
        target = spec.target
        print_file_progress(TXT2TAG, rconfig.corpus, count, target, verbose)
        infile, outfile = prepare_io(target, input_dataset, output_dataset)
        uncompress(infile)
        txt2tag.tag(infile, outfile, tagger)
        # compressing here applies to English only; this becomes relevant for
        # Chinese once there is a segmenter/tagger that uses a single step
        if rconfig.language == 'en':
            compress(infile, outfile)
        # checkpoint the number of processed files every STEP files
        if count % STEP == 0:
            output_dataset.update_processed_count(STEP)
    return (count % STEP, [output_dataset])
def run_classifier(self, t1, corpus): ensure_path(self.output) self._create_mallet_file(corpus=corpus) self._run_classifier() self._calculate_scores() self._create_info_files(t1) for fname in (self.results_file, self.mallet_file, self.scores_s1): print "[Classifier.run_classifier] Compressing", fname compress(fname)
def run(self):
    """Run the batch classification pipeline and compress the intermediate
    result files. Exits with a warning when results for this batch already
    exist on disk."""
    if os.path.exists(self.info_file_general):
        sys.exit("WARNING: already have classifier results in %s" % self.batch)
    ensure_path(self.batch)
    # the pipeline steps, in their required order
    steps = (self._find_datasets,
             self._create_mallet_file,
             self._run_classifier,
             self._calculate_scores,
             self._run_eval,
             self._create_info_files)
    for step in steps:
        step()
    compress(self.results_file, self.mallet_file, self.scores_s1)
def mallet_train_classifier(self): commands = [self.mallet_config.cmd_csv2vectors_train, self.mallet_config.cmd_train_classifier, self.mallet_config.cmd_classifier2info, self.mallet_config.cmd_cinfo_sorted] if self.mallet_config.prune_p: commands.append(self.mallet_config.cmd_prune) for cmd in commands: print "[mallet_train_classifier]" run_command(cmd) compress(self.mallet_config.cinfo_file, self.mallet_config.cinfo_sorted_file, self.mallet_config.train_vectors_file, self.mallet_config.train_vectors_out_file, self.mallet_config.train_mallet_file)
def run_populate(rconfig, limit, verbose=False): """Populate xml directory in the target directory with limit files from the source file list or the source directory.""" output_name = DOCUMENT_PROCESSING_IO[POPULATE]['out'] dataset = DataSet(POPULATE, output_name, rconfig) # initialize data set if it does not exist, this is not contingent on # anything because --populate is the first step if not dataset.exists(): dataset.initialize_on_disk() dataset.load_from_disk() fspecs = get_lines(rconfig.filenames, dataset.files_processed, limit) print "[--populate] adding %d files to %s" % (len(fspecs), dataset) count = 0 for fspec in fspecs: count += 1 src_file = fspec.source dst_file = os.path.join(rconfig.target_path, 'data', output_name, dataset.version_id, 'files', fspec.target) # allow for compressed files, while being handed the name without # extension if not os.path.exists(src_file): src_file += ".gz" dst_file += ".gz" if verbose: print "[--populate] %04d %s" % (count, dst_file) ensure_path(os.path.dirname(dst_file)) try: shutil.copyfile(src_file, dst_file) except IOError: print " WARNING: source file does not exist, not copying" print " %s" % src_file # at some point there seemed to be an issue with compressing for Chinese, # so added this to do language dependent compressing, there is now no # difference for the population phase if rconfig.language == 'en': compress(dst_file) elif rconfig.language == 'cn': compress(dst_file) # TODO: does this mean that you miss some if total_count % STEP != 0 if count % STEP == 0: dataset.update_processed_count(STEP) return (count % STEP, [dataset])
def run_xml2txt(rconfig, limit, options, verbose=False): """Takes the xml file and produces a txt file with a simplified document structure, keeping date, title, abstract, summary, description_rest, first_claim and other_claims. Does this by calling the document structure parser in onto mode if the document source is LEXISNEXIS and uses a simple parser defined in xml2txt if the source is WOS..""" input_dataset = find_input_dataset(XML2TXT, rconfig) output_dataset = find_output_dataset(XML2TXT, rconfig) print_datasets(XML2TXT, input_dataset, output_dataset) check_file_counts(input_dataset, output_dataset, limit) count = 0 doc_parser = make_parser(rconfig.language) workspace = os.path.join(rconfig.target_path, 'data', 'workspace') fspecs = get_lines(rconfig.filenames, output_dataset.files_processed, limit) for fspec in fspecs: count += 1 filename = fspec.target print_file_progress(XML2TXT, rconfig.corpus, count, filename, verbose) file_in, file_out = prepare_io(filename, input_dataset, output_dataset) uncompress(file_in) try: xml2txt.xml2txt(doc_parser, rconfig.datasource, file_in, file_out, workspace) except Exception as e: # just write an empty file that can be consumed downstream fh = codecs.open(file_out, 'w') fh.close() print "[--xml2txt] WARNING: error on", file_in #print " ", e # we now do compress the cn output of the document parser (which we # initialy did not do) if rconfig.language == 'en': compress(file_in, file_out) elif rconfig.language == 'cn': compress(file_in, file_out) if count % STEP == 0: output_dataset.update_processed_count(STEP) #xml2txt.print_stats() return (count % STEP, [output_dataset])
def run_seg2tag(rconfig, limit, options, verbose):
    """Run the Chinese tagger over the seg files of the input data set,
    writing one tag file per input file to the output data set."""
    input_dataset = find_input_dataset(SEG2TAG, rconfig)
    output_dataset = find_output_dataset(SEG2TAG, rconfig)
    print_datasets(SEG2TAG, input_dataset, output_dataset)
    check_file_counts(input_dataset, output_dataset, limit)
    tagger = txt2tag.get_tagger(rconfig.language)
    specs = get_lines(rconfig.filenames, output_dataset.files_processed, limit)
    count = 0
    for count, spec in enumerate(specs, 1):
        target = spec.target
        print_file_progress(SEG2TAG, rconfig.corpus, count, target, verbose)
        infile, outfile = prepare_io(target, input_dataset, output_dataset)
        uncompress(infile)
        cn_seg2tag.tag(infile, outfile, tagger)
        compress(infile, outfile)
        # checkpoint the number of processed files every STEP files
        if count % STEP == 0:
            output_dataset.update_processed_count(STEP)
    return (count % STEP, [output_dataset])
def compress_files(self):
    """Compress the mallet test file and the classifier output file."""
    cfg = self.mallet_config
    compress(cfg.test_mallet_file, cfg.classifier_out_file)
def copy_and_compress(source, target_dir, target_file):
    """Copy source to target_file, first making sure that target_dir exists,
    then compress the newly created copy."""
    ensure_path(target_dir)
    shutil.copyfile(source, target_file)
    compress(target_file)