def __init__(self, a_subcorpus, tag, a_cursor=None, extension="name", indexing="word"):
    """Initialize the name bank and, when no cursor is given, load it from disk.

    Parameters:
        a_subcorpus: subcorpus handed to ``abstract_bank.__init__``.
        tag: bank tag handed to ``abstract_bank.__init__``.
        a_cursor: when ``None`` (the default) every file returned by
            ``self.subcorpus.get_files(self.extension)`` is read and
            appended to this bank as a ``name_tagged_document``.  When a
            cursor is supplied nothing is read here.
        extension: file extension handed to ``abstract_bank.__init__``.
        indexing: forwarded unchanged to ``name_tagged_document`` as the
            ``indexing`` keyword argument.

    Progress is reported on ``sys.stderr``: a header, one dot per file, and
    a final newline.
    """
    # NOTE(review): self.subcorpus and self.extension are presumably set by
    # abstract_bank.__init__ -- confirm against that class.
    abstract_bank.__init__(self, a_subcorpus, tag, extension)

    # Identity test: a cursor object with a custom __eq__ must not be able
    # to masquerade as "no cursor".
    if a_cursor is None:
        sys.stderr.write("reading the name bank [%s]..." % self.extension)

        for a_file in self.subcorpus.get_files(self.extension):
            sys.stderr.write(".")

            # The context manager closes the file even if read() raises.
            with codecs.open(a_file.physical_filename, "r", "utf-8") as name_file:
                name_tagged_document_string = name_file.read()

            # Document ids take the form "<document id>@<subcorpus id>".
            name_tagged_document_id = "%s@%s" % (a_file.document_id, self.subcorpus.id)
            a_name_tagged_document = name_tagged_document(
                name_tagged_document_string,
                name_tagged_document_id,
                self.extension,
                indexing=indexing)

            self.append(a_name_tagged_document)

        sys.stderr.write("\n")
def __init__(self, a_subcorpus, tag, a_cursor=None, extension="parallel"):
    """Initialize the parallel bank and, when no cursor is given, load it from disk.

    Parameters:
        a_subcorpus: subcorpus handed to ``abstract_bank.__init__``; its
            ``id`` is also passed to ``parallel_document.from_file``.
        tag: bank tag handed to ``abstract_bank.__init__``.
        a_cursor: when ``None`` (the default) every file returned by
            ``self.subcorpus.get_files(self.extension)`` is read.  When a
            cursor is supplied nothing is read here.
        extension: file extension handed to ``abstract_bank.__init__``.

    The three ``matching_*`` lists always start out empty.  Progress is
    reported on ``sys.stderr``: a header, one dot per file, and a final
    newline.
    """
    # NOTE(review): self.subcorpus and self.extension are presumably set by
    # abstract_bank.__init__ -- confirm against that class.
    abstract_bank.__init__(self, a_subcorpus, tag, extension)

    self.matching_parallel_banks = []
    self.matching_treebanks = []
    self.matching_subcorpora = []

    # Identity test: a cursor object with a custom __eq__ must not be able
    # to masquerade as "no cursor".
    if a_cursor is None:
        sys.stderr.write("reading the parallel bank [%s] ..." % self.extension)

        for a_file in self.subcorpus.get_files(self.extension):
            sys.stderr.write(".")

            with codecs.open(a_file.physical_filename, "r", "utf-8") as f:
                parallel_file_lines = f.readlines()

            # NOTE(review): an empty file raises IndexError on the [0]
            # lookup below; decide whether empty files should be skipped or
            # reported before changing this.
            if parallel_file_lines[0].startswith("original document"):
                # We want to map translated documents back to their
                # originals.  There are two mapping files, which means
                # there is redundant information, and we just need to do
                # things with the parallel files that represent
                # translations.  Files whose first line starts with
                # "original document" are therefore skipped.
                continue

            self.append(
                parallel_document.from_file(
                    parallel_file_lines,
                    a_file.document_id,
                    a_subcorpus.id,
                    self.extension))

        sys.stderr.write("\n")
def __init__(self, a_subcorpus, tag, a_cursor=None, extension="parallel"):
    """Set up an empty parallel bank and populate it from files if no cursor is passed.

    Parameters:
        a_subcorpus: subcorpus forwarded to ``abstract_bank.__init__``; its
            ``id`` is also given to ``parallel_document.from_file``.
        tag: bank tag forwarded to ``abstract_bank.__init__``.
        a_cursor: ``None`` (the default) triggers reading every file
            returned by ``self.subcorpus.get_files(self.extension)``.  Any
            other value leaves the bank unpopulated by this constructor.
        extension: file extension forwarded to ``abstract_bank.__init__``.

    ``matching_parallel_banks``, ``matching_treebanks`` and
    ``matching_subcorpora`` are initialized to empty lists in every case.
    A header, one dot per file and a closing newline are written to
    ``sys.stderr`` while loading.
    """
    # NOTE(review): self.subcorpus / self.extension are presumably assigned
    # inside abstract_bank.__init__ -- verify.
    abstract_bank.__init__(self, a_subcorpus, tag, extension)

    self.matching_parallel_banks = []
    self.matching_treebanks = []
    self.matching_subcorpora = []

    # "is None" rather than "== None": the comparison must not depend on
    # whatever __eq__ the cursor type happens to define.
    if a_cursor is None:
        sys.stderr.write("reading the parallel bank [%s] ..." % self.extension)

        for a_file in self.subcorpus.get_files(self.extension):
            sys.stderr.write(".")

            with codecs.open(a_file.physical_filename, "r", "utf-8") as f:
                parallel_file_lines = f.readlines()

            # NOTE(review): a zero-length file makes the [0] lookup below
            # raise IndexError; confirm whether such files can occur.
            if parallel_file_lines[0].startswith("original document"):
                # We want to map translated documents back to their
                # originals.  There are two mapping files, which means
                # there is redundant information, and we just need to do
                # things with the parallel files that represent
                # translations.  So any file whose first line starts with
                # "original document" is not loaded.
                continue

            self.append(
                parallel_document.from_file(
                    parallel_file_lines,
                    a_file.document_id,
                    a_subcorpus.id,
                    self.extension))

        sys.stderr.write("\n")
def __init__(self, a_subcorpus, tag, a_cursor=None, extension="speaker"):
    """Initialize the speaker bank and, when no cursor is given, load it from disk.

    Parameters:
        a_subcorpus: subcorpus handed to ``abstract_bank.__init__``; its
            ``id`` forms the second half of each document id.
        tag: bank tag handed to ``abstract_bank.__init__``.
        a_cursor: when ``None`` (the default) every file returned by
            ``self.subcorpus.get_files(self.extension)`` is read and
            appended as a ``speaker_document``.  When a cursor is supplied
            nothing is read here.
        extension: file extension handed to ``abstract_bank.__init__``.

    Progress is reported on ``sys.stderr``: a header, one dot per file, and
    a final newline.
    """
    # NOTE(review): self.subcorpus and self.extension are presumably set by
    # abstract_bank.__init__ -- confirm against that class.
    abstract_bank.__init__(self, a_subcorpus, tag, extension)

    # Identity test: a cursor object with a custom __eq__ must not be able
    # to masquerade as "no cursor".
    if a_cursor is None:
        sys.stderr.write("reading the speaker bank [%s] ..." % self.extension)

        for a_file in self.subcorpus.get_files(self.extension):
            sys.stderr.write(".")

            with codecs.open(a_file.physical_filename, "r", "utf-8") as f:
                speaker_file_lines = f.readlines()

            # Document ids take the form "<document id>@<subcorpus id>".
            self.append(
                speaker_document.from_file(
                    speaker_file_lines,
                    a_file.document_id + "@" + a_subcorpus.id,
                    self.extension))

        sys.stderr.write("\n")
def __init__(self, a_subcorpus, tag, a_cursor=None, extension="name", indexing="word"):
    """Set up the name bank and populate it from files if no cursor is passed.

    Parameters:
        a_subcorpus: subcorpus forwarded to ``abstract_bank.__init__``.
        tag: bank tag forwarded to ``abstract_bank.__init__``.
        a_cursor: ``None`` (the default) triggers reading every file
            returned by ``self.subcorpus.get_files(self.extension)``; each
            one is wrapped in a ``name_tagged_document`` and appended.  Any
            other value leaves the bank unpopulated by this constructor.
        extension: file extension forwarded to ``abstract_bank.__init__``.
        indexing: passed through to ``name_tagged_document`` as its
            ``indexing`` keyword argument.

    A header, one dot per file and a closing newline are written to
    ``sys.stderr`` while loading.
    """
    # NOTE(review): self.subcorpus / self.extension are presumably assigned
    # inside abstract_bank.__init__ -- verify.
    abstract_bank.__init__(self, a_subcorpus, tag, extension)

    # "is None" rather than "== None": the comparison must not depend on
    # whatever __eq__ the cursor type happens to define.
    if a_cursor is None:
        sys.stderr.write("reading the name bank [%s]..." % self.extension)

        for a_file in self.subcorpus.get_files(self.extension):
            sys.stderr.write(".")

            # "with" guarantees the handle is closed, matching how the
            # other bank constructors in this file read their inputs.
            with codecs.open(a_file.physical_filename, "r", "utf-8") as name_file:
                name_tagged_document_string = name_file.read()

            # Ids are built as "<document id>@<subcorpus id>".
            name_tagged_document_id = "%s@%s" % (a_file.document_id, self.subcorpus.id)
            a_name_tagged_document = name_tagged_document(
                name_tagged_document_string,
                name_tagged_document_id,
                self.extension,
                indexing=indexing)

            self.append(a_name_tagged_document)

        sys.stderr.write("\n")