Exemplo n.º 1
0
def main():
    """Build a concept data store from bz2-compressed input files.

    Three positional command-line arguments are read from sys.argv:
        argv[1] -- name of the data store to write to
        argv[2] -- bz2-compressed original file to convert
        argv[3] -- bz2-compressed MRSTY file the semantic types are built from

    Progress messages are printed to standard output.
    """
    store_name = sys.argv[1]
    source_name = sys.argv[2]
    mrsty_name = sys.argv[3]

    # The source file is opened before anything is printed, so a bad path
    # fails before the first progress message.
    source = Text(bz2.BZ2File(source_name, 'r'))

    print("Loading semantic types from %s" % mrsty_name)
    semantic_types = SemanticTypes()
    mrsty_table = MRSTYTable(bz2.BZ2File(mrsty_name))
    semantic_types.build_from_mrsty_file(mrsty_table)
    print("Semantic types loaded.")

    print("Turning the data from %s into %s. Please wait." % (
        source_name, store_name))
    store = StringDBDict(store_name,
                         file_mode='c',
                         sync_every_transactions=0,
                         write_out_every_transactions=200000)
    # sync_every is held at 0 while the dictionary is built and set to 100
    # once it is done (presumably to avoid syncing mid-build -- TODO confirm
    # against StringDBDict).
    store.sync_every = 0
    build_concept_dictionary(source, store, semantic_types)
    store.sync_every = 100
    print("Conversion done.")
Exemplo n.º 2
0
 def __iter__(self):
     """Iterates over the file, skipping lines that contain ignorable
     snippets and constructing line objects of the specified type for all
     others. Lines that raise exceptions are never reported as they are
     malformed. However, they can be examined by ignore_exception and (if
     ignore_exception returns True) parsing may continue.
     
     We will only allow ParsingErrors to be caught, which should be enough
     to ignore truly known parsing problems.
     """
     for line in Text.__iter__(self):
         if self.is_ignorable(line.lower()):
             continue
         try:
             new_line = self.__line_type(line)
         except ParsingError, which_exception:
             if self.ignore_exception(which_exception, line):
                 pass
             else:
                 logging.error("Unignorable exception on line '%s'", line)
                 raise
         else:
             yield new_line
Exemplo n.º 3
0
 def __iter__(self):
     """Iterates over the file, skipping lines that contain ignorable
     snippets and constructing line objects of the specified type for all
     others. Lines that raise exceptions are never reported as they are
     malformed. However, they can be examined by ignore_exception and (if
     ignore_exception returns True) parsing may continue.
     
     We will only allow ParsingErrors to be caught, which should be enough
     to ignore truly known parsing problems.
     """
     for line in Text.__iter__(self):
         if self.is_ignorable(line.lower()):
             continue
         try:
             new_line=self.__line_type(line)
         except ParsingError, which_exception:
             if self.ignore_exception(which_exception, line):
                 pass
             else:
                 logging.error("Unignorable exception on line '%s'", line)
                 raise
         else:
             yield new_line
Exemplo n.º 4
0
 def __repr__(self):
     """Return "<ClassName file based on ...>", embedding the %r form of
     the string produced by Text.__repr__ for this object."""
     class_name = self.__class__.__name__
     base_description = Text.__repr__(self)
     return "<%s file based on %r>" % (class_name, base_description)
Exemplo n.º 5
0
 def __init__(self, fileobject, type_of_lines, lines_to_ignore):
     """Set up the underlying Text and remember how lines are handled.

     fileobject      -- passed unchanged to Text.__init__
     type_of_lines   -- callable stored as the line type for this file
     lines_to_ignore -- iterable of strings; each one is stored lowercased
                        and stripped of surrounding whitespace
     """
     Text.__init__(self, fileobject)
     self.__line_type = type_of_lines
     # The file contents are lowercased and stripped, so the snippets to
     # ignore are normalised the same way.
     normalised = []
     for snippet in lines_to_ignore:
         normalised.append(snippet.lower().strip())
     self.__lines_to_ignore = normalised
Exemplo n.º 6
0
 def __repr__(self):
     """Describe this object as "<ClassName file based on ...>", where the
     trailing part is the %r rendering of Text.__repr__(self)."""
     own_type = self.__class__.__name__
     text_repr = Text.__repr__(self)
     description = "<%s file based on %r>" % (own_type, text_repr)
     return description
Exemplo n.º 7
0
 def __init__(self, fileobject, type_of_lines, lines_to_ignore):
     """Initialise the Text base with fileobject and record the line
     handling configuration.

     type_of_lines is kept as the line type for this file. Every entry of
     lines_to_ignore is stored lowercased and stripped.
     """
     Text.__init__(self, fileobject)
     self.__line_type = type_of_lines
     # Ignore-snippets get the same lowercase/strip treatment as the file
     # contents they are compared against.
     cleaned_snippets = []
     for entry in lines_to_ignore:
         lowered = entry.lower()
         cleaned_snippets.append(lowered.strip())
     self.__lines_to_ignore = cleaned_snippets