Пример #1
0
def start():
    """Load the ontonotes object and process each of its subcorpora.

    Options are obtained from ``on.common.util.load_options`` with
    positional arguments disabled.  Each subcorpus yielded by the resulting
    ``on.ontonotes`` object is announced on stdout and then passed to
    ``process_subcorpus_a``.
    """
    config = on.common.util.load_options(positional_args=False)
    a_ontonotes = on.ontonotes(config)

    for a_subcorpus in a_ontonotes:
        # A single parenthesised argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print("  >> processing subcorpus %s" % a_subcorpus.id)
        process_subcorpus_a(a_subcorpus)
Пример #2
0
def load_ontonotes(corpus):
    """Load one OntoNotes corpus and return its single subcorpus object.

    Args:
        corpus: Name of the corpus to load; must be contained in
            ``consts.valid_ontonotes_corpus``.

    Returns:
        The only element of the ``on.ontonotes`` object built for ``corpus``.

    Raises:
        AssertionError: If ``corpus`` is not a valid name, or if loading
            does not produce exactly one subcorpus.
    """
    assert corpus in consts.valid_ontonotes_corpus, \
        'ontonotes corpora can only be one of {}'.format(
            consts.valid_ontonotes_corpus)

    log.info('Reading Ontonotes corpus {} from {}'.format(
        corpus, cfg.ontonotes_root))

    on_cfg = get_default_ontonotes_config()
    on_cfg.set('corpus', 'data_in', cfg.ontonotes_root)
    on_cfg.set('corpus', 'load', corpus)

    start_time = timeit.default_timer()

    # suppress stderr, as the following commands print too much useless info
    null_fd, save_fd = supress_fd(2)
    try:
        a_ontonotes = on.ontonotes(on_cfg)

        assert len(a_ontonotes) == 1
        # Use a separate name so the ``corpus`` argument (a name) is not
        # rebound to the loaded object.
        subcorpus = a_ontonotes[0]
    finally:
        # Always restore stderr, even when loading raises or the assert
        # fails; otherwise all later error output would be lost.
        restore_fd(2, null_fd, save_fd)

    elapsed = timeit.default_timer() - start_time
    log.info('Done in {:.3f} seconds'.format(elapsed))

    log.info('Found {} files with extensions {}'.format(
        len(subcorpus['document']), on_cfg.get('corpus', 'banks').split()))

    return subcorpus
Пример #3
0
def start():
    """Load the ontonotes object and process each of its subcorpora.

    Options are obtained from ``on.common.util.load_options`` with
    positional arguments disabled.  Each subcorpus yielded by the resulting
    ``on.ontonotes`` object is announced on stdout and then passed to
    ``process_subcorpus_a``.
    """
    config = on.common.util.load_options(positional_args=False)
    a_ontonotes = on.ontonotes(config)

    for a_subcorpus in a_ontonotes:
        # A single parenthesised argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print("  >> processing subcorpus %s" % a_subcorpus.id)
        process_subcorpus_a(a_subcorpus)
Пример #4
0
def load_to_db():
    """Write every loaded subcorpus, then the type tables, through a cursor.

    Options come from ``on.common.util.load_options`` (no positional
    arguments); the cursor is obtained from the ontonotes object itself.
    """
    options = on.common.util.load_options(positional_args=False)
    corpus = on.ontonotes(options)
    cursor = corpus.db_cursor(options)

    for subcorpus in corpus:
        subcorpus.write_to_db(cursor)

    # The type tables are written once, after all subcorpora.
    corpus.write_type_tables_to_db(cursor)
Пример #5
0
def load_ontonotes(corpora):
    """Return a list of all subcorpora loaded for ``corpora``.

    Args:
        corpora: Value stored under the ``load`` option of the ``corpus``
            section; must be contained in ``consts.VALID_ONTONOTES_CORPORA``.

    Returns:
        A list holding every item yielded by the ``on.ontonotes`` object.

    Raises:
        AssertionError: If ``corpora`` is not one of the valid values.
    """
    assert corpora in consts.VALID_ONTONOTES_CORPORA, \
        'ontonotes corpora can only be one of {}'.format(
            consts.VALID_ONTONOTES_CORPORA)

    on_cfg = get_default_ontonotes_cfg()
    on_cfg.set('corpus', 'load', corpora)

    loaded = on.ontonotes(on_cfg)
    return [subcorpus for subcorpus in loaded]
Пример #6
0
def create_onfs():
    """Call ``dump_onf`` on the parse bank of every loaded subcorpus.

    Options are obtained from ``on.common.util.load_options`` with
    positional arguments disabled.  The output directory is looked up as
    ``config["out", "out_dir"]`` and no database cursor is supplied
    (``a_cursor=None``).  The id of each subcorpus is printed before it is
    dumped.
    """
    config = on.common.util.load_options(positional_args=False)

    a_ontonotes = on.ontonotes(config)

    for a_subcorpus in a_ontonotes:
        # One pre-formatted argument prints the same text under both the
        # Python 2 print statement and the Python 3 print function.
        print("Loading %s" % (a_subcorpus.id,))
        a_subcorpus["parse"].dump_onf(a_cursor=None,
                                      out_dir=config["out", "out_dir"])
Пример #7
0
def create_onfs():
    """Call ``dump_onf`` on the parse bank of every loaded subcorpus.

    Options are obtained from ``on.common.util.load_options`` with
    positional arguments disabled.  The output directory is looked up as
    ``config["out", "out_dir"]`` and no database cursor is supplied
    (``a_cursor=None``).  The id of each subcorpus is printed before it is
    dumped.
    """
    config = on.common.util.load_options(positional_args=False)

    a_ontonotes = on.ontonotes(config)

    for a_subcorpus in a_ontonotes:
        # One pre-formatted argument prints the same text under both the
        # Python 2 print statement and the Python 3 print function.
        print("Loading %s" % (a_subcorpus.id,))
        a_subcorpus["parse"].dump_onf(a_cursor=None,
                                      out_dir=config["out", "out_dir"])
Пример #8
0
def start():
    """Call ``dump_view`` on every bank of every loaded subcorpus.

    A cursor is obtained from ``on.ontonotes.db_cursor`` and the output
    directory from ``config["FilesFromDb", "out_dir"]``; both are passed to
    each bank's ``dump_view``.

    Raises:
        Exception: If the loaded ontonotes object is falsy.
    """
    options = on.common.util.load_options(positional_args=False)
    corpus = on.ontonotes(options)

    cursor = on.ontonotes.db_cursor(options)
    target_dir = options["FilesFromDb", "out_dir"]

    if not corpus:
        raise Exception("Failed to load anything")

    # Lazily walk the banks of each subcorpus in order.
    banks = (bank for subcorpus in corpus for bank in subcorpus.itervalues())
    for bank in banks:
        bank.dump_view(cursor, target_dir)
Пример #9
0
def start():
    """Call ``dump_view`` on every bank of every loaded subcorpus.

    A cursor is obtained from ``on.ontonotes.db_cursor`` and the output
    directory from ``config["FilesFromDb", "out_dir"]``; both are passed to
    each bank's ``dump_view``.

    Raises:
        Exception: If the loaded ontonotes object is falsy.
    """
    options = on.common.util.load_options(positional_args=False)
    corpus = on.ontonotes(options)

    cursor = on.ontonotes.db_cursor(options)
    target_dir = options["FilesFromDb", "out_dir"]

    if not corpus:
        raise Exception("Failed to load anything")

    # Lazily walk the banks of each subcorpus in order.
    banks = (bank for subcorpus in corpus for bank in subcorpus.itervalues())
    for bank in banks:
        bank.dump_view(cursor, target_dir)
Пример #10
0
 def getAOntonotes(self):
     """Build and return an ontonotes object from the loaded options.

     See on/__init__.py for usages of the ontonotes object.
     """
     # Create a config object (no positional arguments).
     options = on.common.util.load_options(positional_args=False)
     # Create the ontonotes object by passing in that config object.
     return on.ontonotes(options)
Пример #11
0
 def getAOntonotes(self):
     """Build and return an ontonotes object from the loaded options.

     See on/__init__.py for usages of the ontonotes object.
     """
     # Create a config object (no positional arguments).
     options = on.common.util.load_options(positional_args=False)
     # Create the ontonotes object by passing in that config object.
     return on.ontonotes(options)
Пример #12
0
    def loadOntonotes(self):
        """Load the ontonotes object together with the head-trees path.

        See on/__init__.py for usages of the ontonotes object.

        Returns:
            A tuple ``(a_ontonotes, head_trees_path)``: the ontonotes object
            built from element 0 of the loaded options, and the first entry
            of element 1 (the original comments call this the head rules).
        """
        # Load the options with positional arguments enabled; the result is
        # indexed below rather than used as a config object directly.
        parsed = on.common.util.load_options(positional_args=True)

        # Read the positional entry first, before building the corpus object.
        head_trees_path = parsed[1][0]

        # Element 0 is what gets passed to on.ontonotes as the config object.
        corpus = on.ontonotes(parsed[0])
        return (corpus, head_trees_path)
Пример #13
0
Optionally a MySQL database may be used for the data, but it is
easier just to have the API access the text files directly.
Refer to the PDF version of the documentation for details (the 
online HTML version may be out of date):
  http://cemantix.org/download/ontonotes/beta/doc/pdf/on.pdf
  
@author: Nathan Schneider (nschneid)
@since: 2012-06-05
'''

from __future__ import print_function
import on, sys

# Path of the configuration file, taken from the first command-line argument.
cfgFP = sys.argv[1]
# Load that file into a config object and build the ontonotes object from it.
cfg = on.common.util.load_config(cfgFP)
all_on = on.ontonotes(cfg)


def describe_prop(p, indent=''):
    """Print a summary of proposition ``p`` and of its arguments.

    The first line shows the lemma, sense number, quality and the token
    index of the primary predicate; the second is a dashed underline three
    characters longer than the lemma.  Every member of every group in ``p``
    is then listed; members that are not predicates are additionally
    followed by the word string of each of their nodes.

    Args:
        p: The proposition to describe; iterating it yields groups of
            arguments.
        indent: Value printed as the first field of every output line.
    """
    header = (p.lemma, p.pb_sense_num, p.quality,
              p.get_primary_predicate().token_index)
    print(indent, *header)
    print(indent, '-' * (len(p.lemma) + 3))

    # Each group is an argument analogue, i.e. a group of coreferent
    # argument fillers; walk their members lazily, in order.
    for member in (arg for group in p for arg in group):
        if isinstance(member, on.corpora.proposition.predicate):
            print(indent, member.enc_self, member.type, member.token_index)
            continue

        print(indent, member.enc_self, member.type)
        for node in member:
            print(indent, '   ', node.subtree.get_word_string())
Optionally a MySQL database may be used for the data, but it is 
easier just to have the API access the text files directly.
Refer to the PDF version of the documentation for details (the 
online HTML version may be out of date):
  http://cemantix.org/download/ontonotes/beta/doc/pdf/on.pdf
  
@author: Nathan Schneider (nschneid)
@since: 2012-06-05
'''

from __future__ import print_function
import on, sys

# Path of the configuration file, taken from the first command-line argument.
cfgFP = sys.argv[1]
# Load that file into a config object and build the ontonotes object from it.
cfg = on.common.util.load_config(cfgFP)
all_on = on.ontonotes(cfg)

def describe_prop(p, indent=''):
    """Print a summary of proposition ``p`` and of its arguments.

    The first line shows the lemma, sense number, quality and the token
    index of the primary predicate; the second is a dashed underline three
    characters longer than the lemma.  Every member of every group in ``p``
    is then listed; members that are not predicates are additionally
    followed by the word string of each of their nodes.

    Args:
        p: The proposition to describe; iterating it yields groups of
            arguments.
        indent: Value printed as the first field of every output line.
    """
    header = (p.lemma, p.pb_sense_num, p.quality,
              p.get_primary_predicate().token_index)
    print(indent, *header)
    print(indent, '-' * (len(p.lemma) + 3))

    # Each group is an argument analogue, i.e. a group of coreferent
    # argument fillers; walk their members lazily, in order.
    for member in (arg for group in p for arg in group):
        if isinstance(member, on.corpora.proposition.predicate):
            print(indent, member.enc_self, member.type, member.token_index)
            continue

        print(indent, member.enc_self, member.type)
        for node in member:
            print(indent, '   ', node.subtree.get_word_string())
           
# NOTE(review): presumably an upper bound on how many trees get processed;
# the code that uses it is not visible in this excerpt -- confirm.
MAX_TREES = 10