Пример #1
0
def extract(setting):
    """
    Run the feature extraction step of an experiment.

    Nothing is done unless setting.extract is true. Otherwise the instance
    and true corpus directories are created and extract_files is called for
    the development parts (if setting.develop) and subsequently for the
    validation parts (if setting.validate).

    @param setting: Setting instance specifying the experimental setting
    """
    if not setting.extract:
        return

    log.info("\n" + header("EXTRACT STEP"))

    makedirs(setting.inst_dir)
    makedirs(setting.true_dir)

    def extract_parts(part_fns):
        # derive the output filenames from the part filenames, then extract
        instance_files = setting.make_inst_fns(part_fns)
        true_files = setting.make_true_fns(part_fns)
        extract_files(setting.extractor,
                      setting.graph_selector,
                      part_fns,
                      instance_files,
                      true_files,
                      binary=setting.binary)

    if setting.develop:
        extract_parts(setting.dev_part_fns)

    if setting.validate:
        extract_parts(setting.val_part_fns)
Пример #2
0
def evaluate(setting):
    """
    Run the evaluation step of an experiment.

    Nothing is done unless setting.evaluate is true. Otherwise the
    evaluation directory is created and eval_files is called for the
    development data (if setting.develop) and for the validation data (if
    setting.validate). The value returned by eval_files is stored on the
    setting as dev_eval and val_eval respectively.

    @param setting: Setting instance specifying the experimental setting
    """
    if not setting.evaluate:
        return

    log.info("\n" + header("EVALUATE STEP"))
    makedirs(setting.eval_dir)

    if setting.develop:
        dev_result = eval_files(setting.dev_true_fns,
                                setting.dev_pred_fns,
                                setting.dev_eval_fname,
                                align_eval=setting.evaluator,
                                n=setting.n)
        setting.dev_eval = dev_result

    if setting.validate:
        val_result = eval_files(setting.val_true_fns,
                                setting.val_pred_fns,
                                setting.val_eval_fname,
                                align_eval=setting.evaluator,
                                n=setting.n)
        setting.val_eval = val_result
Пример #3
0
def extract(setting):
    """
    Extract features from the corpus part files.

    When setting.extract is false this is a no-op. Otherwise the instance
    and true corpus directories are created, after which extract_files is
    run on the development parts and/or the validation parts, depending on
    setting.develop and setting.validate.

    @param setting: Setting instance specifying the experimental setting
    """
    if not setting.extract:
        return

    log.info("\n" + header("EXTRACT STEP"))

    makedirs(setting.inst_dir)
    makedirs(setting.true_dir)

    # (switch attribute, part filenames attribute), in processing order;
    # the part filenames are only looked up when the switch is on
    data_sets = (("develop", "dev_part_fns"),
                 ("validate", "val_part_fns"))

    for switch, parts_attr in data_sets:
        if not getattr(setting, switch):
            continue

        part_fns = getattr(setting, parts_attr)
        inst_fns = setting.make_inst_fns(part_fns)
        true_fns = setting.make_true_fns(part_fns)

        extract_files(setting.extractor, setting.graph_selector, part_fns,
                      inst_fns, true_fns, binary=setting.binary)
Пример #4
0
def classify_file(train_inst_fns,
                  test_inst_fns,
                  out_fns=None,
                  log_fn=None,
                  clas_dir=None,
                  descriptor=None,
                  timbl=None,
                  options="",
                  log=False):
    """
    Train Timbl on one set of instance files and classify another set.

    @param train_inst_fns: a list of instance filenames for training

    @param test_inst_fns: a list of instance filenames for testing

    @keyword out_fns: list of classifier output files to be created

    @keyword log_fn: classifier log file to be created; ignored if keyword
    log is false

    @keyword clas_dir: directory for creating classifier output files, which
    is created first when given; ignored if out_fns is given

    @keyword descriptor: a Descriptor instance, used to build the default
    Timbl options when no TimblFile is supplied; ignored if timbl is supplied

    @keyword timbl: a tailored TimblFile instance; its default options must
    contain the verbosity options +vo, +vdb, +vdi, and the -m option to
    specify that the administrative features must be ignored.

    @keyword options: additional Timbl options (an empty string by default),
    excluding -f, -m, +vo, +vdb, +vdi

    @keyword log: log Timbl's standard output and error streams to file

    @return: list of Timbl output filenames
    """
    if clas_dir:
        makedirs(clas_dir)

    if timbl:
        # a supplied classifier wins over the descriptor, but it has to
        # carry every option the later processing depends on
        assert isinstance(timbl, TimblFile)
        for required_opt in ("+vo", "+vdb", "+vdi", "-m"):
            assert required_opt in timbl.default_opts
    else:
        assert descriptor
        timbl = TimblFile(default_opts=timbl_options_string(descriptor))

    result = timbl.train_test_multi(
        train_inst_fns,
        test_inst_fns,
        out_fns=out_fns,
        log_fn=log_fn,
        options=options,
        log=log,
        out_dir=clas_dir)
    return result
Пример #5
0
def merge(setting):
    """
    Run the merge step of an experiment.

    Nothing is done unless setting.merge is true. Otherwise the directory
    for predicted corpora is created and merge_files is called for the
    development data (if setting.develop) and for the validation data (if
    setting.validate).

    @param setting: Setting instance specifying the experimental setting
    """
    if not setting.merge:
        return

    log.info("\n" + header("MERGE STEP"))
    makedirs(setting.pred_dir)

    if setting.develop:
        # filenames of the predicted corpora follow from the true corpora
        dev_pred_fns = setting.make_pred_fns(setting.dev_true_fns)

        merge_files(setting.dev_inst_fns,
                    setting.dev_true_fns,
                    dev_pred_fns,
                    merger=setting.merger,
                    descriptor=setting.descriptor,
                    n=setting.n,
                    binary=setting.binary)

    if setting.validate:
        val_pred_fns = setting.make_pred_fns(setting.val_true_fns)

        merge_files(setting.val_inst_fns,
                    setting.val_true_fns,
                    val_pred_fns,
                    merger=setting.merger,
                    descriptor=setting.descriptor,
                    n=setting.n,
                    binary=setting.binary)
Пример #6
0
def classify_file(train_inst_fns,
                  test_inst_fns,
                  out_fns=None,
                  log_fn=None,
                  clas_dir=None,
                  descriptor=None,
                  timbl=None,
                  options="",
                  log=False):
    """
    Classify test instances with Timbl after training on other instances.

    @param train_inst_fns: a list of instance filenames for training

    @param test_inst_fns: a list of instance filenames for testing

    @keyword out_fns: list of classifier output files to be created

    @keyword log_fn: classifier log file to be created; ignored if keyword
    log is false

    @keyword clas_dir: directory for creating classifier output files
    (created if given); ignored if out_fns is given

    @keyword descriptor: a Descriptor instance from which the default Timbl
    options are derived; required unless a TimblFile is supplied, in which
    case it is ignored

    @keyword timbl: a tailored TimblFile instance; notice that its default
    options must at least include +vo, +vdb, +vdi, and the -m option to
    specify that the administrative features must be ignored.

    @keyword options: additional Timbl options (default is the empty
    string), excluding -f, -m, +vo, +vdb, +vdi

    @keyword log: log Timbl's standard output and error streams to file

    @return: list of Timbl output filenames
    """
    if clas_dir:
        makedirs(clas_dir)

    if not timbl:
        assert descriptor
        default_opts = timbl_options_string(descriptor)
        timbl = TimblFile(default_opts=default_opts)
    else:
        # the descriptor is ignored; the supplied classifier must already
        # carry the mandatory options
        assert isinstance(timbl, TimblFile)
        mandatory = ("+vo", "+vdb", "+vdi", "-m")
        assert all(opt in timbl.default_opts for opt in mandatory)

    return timbl.train_test_multi(
        train_inst_fns, test_inst_fns,
        out_fns=out_fns,
        log_fn=log_fn,
        options=options,
        log=log,
        out_dir=clas_dir)
Пример #7
0
def create_part_files(parts,
                      base_dir=os.getenv("DAESO_CORPUS"),
                      part_dir=None,
                      max_size=None):
    """
    Join corpus files into the part files used for training and testing.

    @param parts: a dictionary where each key specifies the filename for the
    part and each value a sequence of parallel graph corpora filenames merged
    into the part; parts are processed in sorted order

    @keyword base_dir: directory that the original corpus filenames are
    relative to; the default is the value of the DAESO_CORPUS environment
    variable at the moment this function is defined

    @keyword part_dir: the destination directory for the parts, which will be
    created if it does not exist; defaults to the current working directory

    @keyword max_size: limits the maximal number of corpus files per part,
    which is sometimes useful for try-out experiments with only a small number
    of corpus files.

    @return: a list of part filenames created

    Note that the created parts cannot be moved, because they depend on the
    graph bank files of the original pgc files from which they were derived.
    """
    if part_dir:
        makedirs(part_dir)
    else:
        part_dir = os.getcwd()

    created = []

    for name, members in sorted(parts.items()):
        target = os.path.join(part_dir, name)
        sources = [os.path.join(base_dir, member) for member in members]
        # a max_size of None keeps all source files
        merged = join_pgc(sources[:max_size])

        # graphbank file paths by default become relative to the new pgc file
        log.info("saving part file {0}".format(target))
        merged.write(target, pprint=True)
        created.append(target)

    return created
Пример #8
0
def sample(setting):
    """
    Run the sampling step of an experiment.

    Nothing is done unless setting.sample is true. Otherwise the sample
    directory is created and sample_file is called on the development
    instance files (if setting.develop) and on the validation instance
    files (if setting.validate).

    @param setting: Setting instance specifying the experimental setting
    """
    if not setting.sample:
        return

    log.info("\n" + header("SAMPLE STEP"))
    makedirs(setting.samp_dir)

    if setting.develop:
        dev_samp_fns = setting.make_samp_fns(setting.dev_inst_fns)
        sample_file(setting.class_fracts,
                    setting.dev_inst_fns,
                    dev_samp_fns)

    if setting.validate:
        val_samp_fns = setting.make_samp_fns(setting.val_inst_fns)
        sample_file(setting.class_fracts,
                    setting.val_inst_fns,
                    val_samp_fns)
Пример #9
0
def sample(setting):
    """
    Sample the instance files according to setting.class_fracts.

    When setting.sample is false this is a no-op. Otherwise the sample
    directory is created, after which sample_file is run on the development
    and/or validation instance files, depending on setting.develop and
    setting.validate.

    @param setting: Setting instance specifying the experimental setting
    """
    if not setting.sample:
        return

    log.info("\n" + header("SAMPLE STEP"))
    makedirs(setting.samp_dir)

    # (switch attribute, instance filenames attribute), in processing order;
    # the filenames are only looked up when the switch is on
    data_sets = (("develop", "dev_inst_fns"),
                 ("validate", "val_inst_fns"))

    for switch, inst_attr in data_sets:
        if getattr(setting, switch):
            samp_fns = setting.make_samp_fns(getattr(setting, inst_attr))
            sample_file(setting.class_fracts,
                        getattr(setting, inst_attr),
                        samp_fns)
Пример #10
0
def eval_corpora(true_corpora, pred_corpora, names, eval_fname,
                 align_eval=None, n=None):
    """
    Evaluate predicted against true parallel graph corpora and write a report.

    @param true_corpora: iterable of true corpora

    @param pred_corpora: iterable of predicted corpora

    @param names: iterable of labels for true/predicted pairs

    @param eval_fname: name of file to which evaluation output is written;
    its directory is created first

    @keyword align_eval: AlignEval instance, which is re-initialised before
    use; a new AlignEval is created when none is given

    @keyword n: limit evaluation to the first n true/predicted pairs; all
    pairs are evaluated when n is None

    @return: the AlignEval instance holding the evaluation
    """
    if not align_eval:
        align_eval = AlignEval()
    else:
        assert isinstance(align_eval, AlignEval)
        # reset evaluator to prevent accidents
        align_eval.__init__()

    triples = itertools.izip(true_corpora, pred_corpora, names)

    # number the pairs from one, so the loop stops right after the n-th pair
    for seen, (true_corpus, pred_corpus, name) in enumerate(triples, 1):
        align_eval.add(true_corpus, pred_corpus, name)
        if seen == n:
            break

    align_eval.run_eval()
    log.info("saving evaluation report {0}".format(eval_fname))
    report_dir = os.path.dirname(eval_fname)
    makedirs(report_dir)
    align_eval.write(eval_fname)
    return align_eval
Пример #11
0
def classify(setting):
    """
    Run the classification step of an experiment.

    Nothing is done unless setting.classify is true. Training uses the
    sampled development instances when setting.train_sample is true and the
    full development instances otherwise. Development data is classified
    with classify_file_cv (if setting.develop), validation data with
    classify_file (if setting.validate).

    @param setting: Setting instance specifying the experimental setting
    """
    if not setting.classify:
        return

    log.info("\n" + header("CLASSIFY STEP"))

    makedirs(setting.clas_dir)

    train_inst_fns = (setting.dev_samp_fns if setting.train_sample
                      else setting.dev_inst_fns)

    if setting.develop:
        classify_file_cv(train_inst_fns,
                         test_inst_fns=setting.dev_inst_fns,
                         out_fns=setting.make_out_fns(setting.dev_inst_fns),
                         log_fns=setting.make_log_fns(setting.dev_inst_fns),
                         descriptor=setting.descriptor,
                         timbl=setting.classifier,
                         options=setting.timbl_opts,
                         n=setting.n,
                         log=setting.timbl_log)

    if setting.validate:
        # a single log file, named after the first validation instance file
        classify_file(
            train_inst_fns,
            setting.val_inst_fns,
            out_fns=setting.make_out_fns(setting.val_inst_fns),
            log_fn=setting.make_log_fname(setting.val_inst_fns[0]),
            descriptor=setting.descriptor,
            timbl=setting.classifier,
            options=setting.timbl_opts,
            log=setting.timbl_log)
Пример #12
0
def classify(setting):
    """
    Classify the corpus instances of the development and validation data.

    When setting.classify is false this is a no-op. The training files are
    setting.dev_samp_fns if setting.train_sample is true, and
    setting.dev_inst_fns otherwise. Development data (setting.develop) goes
    through classify_file_cv; validation data (setting.validate) goes
    through classify_file.

    @param setting: Setting instance specifying the experimental setting
    """
    if not setting.classify:
        return

    log.info("\n" + header("CLASSIFY STEP"))

    makedirs(setting.clas_dir)

    if setting.train_sample:
        training_files = setting.dev_samp_fns
    else:
        training_files = setting.dev_inst_fns

    if setting.develop:
        dev_files = setting.dev_inst_fns
        dev_out_fns = setting.make_out_fns(setting.dev_inst_fns)
        dev_log_fns = setting.make_log_fns(setting.dev_inst_fns)

        classify_file_cv(training_files,
                         test_inst_fns=dev_files,
                         out_fns=dev_out_fns,
                         log_fns=dev_log_fns,
                         descriptor=setting.descriptor,
                         timbl=setting.classifier,
                         options=setting.timbl_opts,
                         n=setting.n,
                         log=setting.timbl_log)

    if setting.validate:
        val_files = setting.val_inst_fns
        val_out_fns = setting.make_out_fns(setting.val_inst_fns)
        # one log file only, named after the first validation instance file
        val_log_fn = setting.make_log_fname(setting.val_inst_fns[0])

        classify_file(training_files,
                      val_files,
                      out_fns=val_out_fns,
                      log_fn=val_log_fn,
                      descriptor=setting.descriptor,
                      timbl=setting.classifier,
                      options=setting.timbl_opts,
                      log=setting.timbl_log)
Пример #13
0
def exp_dev_fast(setting):
    """
    Perform a fast alignment experiment on development data.

    Weighting, matching and merging take place per test corpus without
    writing intermediary results to a file.

    @param setting: Setting instance specifying the experimental setting;
    setting.develop must be true and setting.validate must be false
    """
    assert setting.develop and not setting.validate

    exp_init(setting)

    create_parts(setting)

    # It's impossible to do extraction one corpus a time, because in order to
    # classify a test corpus you need instances for all the other training
    # corpora! Moreover, since Timbl classification is file-based, we need to
    # write the corpus instance files to disk. These files can be huge and
    # keeping all of them in memory seems to offer little benefit.
    extract(setting)

    sample(setting)

    # Timbl writes its output to a file, which then needs to be parsed in
    # order to insert the class predictions and weights into the corpus
    # instances. That means there is no advantage to doing classification
    # one corpus a time.
    classify(setting)

    log.info("\n" + header("WEIGHT/MATCH/MERGE STEP"))
    # reset evaluator
    if setting.evaluate:
        setting.evaluator.__init__()

    # list() keeps the slice valid where zip() returns an iterator instead
    # of a list; a setting.n of None selects all files
    scope = list(zip(setting.dev_inst_fns,
                     setting.dev_clas_fns,
                     setting.dev_true_fns))[:setting.n]

    for inst_fname, out_fname, true_fname in scope:
        log.info("reading corpus instances {0}".format(inst_fname))
        corpus_inst = CorpusInst()
        corpus_inst.loadtxt(inst_fname, setting.descriptor.dtype)

        if setting.weight:
            log.info("reading classifier output {0}".format(out_fname))
            # The file stays open until weighting is finished, in case the
            # parsed output is consumed lazily, and is closed right after.
            with open(out_fname) as out_file:
                timbl_out = parse_timbl_output(out_file)
                log.info("weighting...")
                weight_corpus(corpus_inst, timbl_out, setting.weight_func)

        if setting.match:
            log.info("matching...")
            match_corpus(corpus_inst, setting.matcher)

        if setting.merge:
            log.info("reading true corpus {0}".format(true_fname))
            true_corpus = ParallelGraphCorpus(inf=true_fname,
                                              graph_loading=LOAD_NONE)
            log.info("merging...")
            pred_corpus = merge_corpus(corpus_inst, true_corpus,
                                       setting.merger)

        if setting.evaluate:
            # NOTE(review): true_corpus and pred_corpus are only bound by the
            # merge branch above, so evaluating with setting.merge disabled
            # fails on unbound names - confirm that combination is excluded
            name = os.path.basename(true_fname).split("_")[0]
            setting.evaluator.add(true_corpus, pred_corpus, name)

    if setting.evaluate:
        log.info("evaluting...")
        setting.evaluator.run_eval()
        log.info("saving evaluation {0}".format(setting.dev_eval_fname))
        makedirs(setting.eval_dir)
        setting.evaluator.write(setting.dev_eval_fname)

    exp_exit(setting)
Пример #14
0
def classify_file_cv(inst_fns,
                     test_inst_fns=None,
                     out_fns=None,
                     log_fns=None,
                     clas_dir=None,
                     descriptor=None,
                     timbl=None,
                     options="",
                     n=None,
                     log=False):
    """
    Classify instances with Timbl in a cross-validation procedure.

    @param inst_fns: a list of instance filenames for training; if no
    test_inst_fns is supplied, the same files will be used for testing,
    otherwise they are used for training only

    @keyword test_inst_fns: a list of instance filenames for testing; this
    allows for down-sampling of the training instances without affecting the
    test instances

    @keyword out_fns: list of classifier output files to be created

    @keyword log_fns: list of classifier log files to be created; ignored if
    keyword log is false

    @keyword clas_dir: directory for creating classifier output files, which
    is created first when given; ignored if out_fns is given

    @keyword descriptor: a Descriptor instance, used to build the default
    Timbl options when no TimblFile is supplied; ignored if timbl is supplied

    @keyword timbl: a tailored TimblFile instance; its default options must
    contain the verbosity options +vo, +vdb, +vdi, and the -m option to
    specify that the administrative features must be ignored.

    @keyword options: additional Timbl options (an empty string by default),
    excluding -f, -m, +vo, +vdb, +vdi

    @keyword n: limit on the number of cross-validations performed (by default
    equals the number of instance filenames)

    @keyword log: log Timbl's standard output and error streams to file

    @return: list of Timbl output filenames
    """
    if clas_dir:
        makedirs(clas_dir)

    if timbl:
        # a supplied classifier wins over the descriptor, but it has to
        # carry every option the later processing depends on
        assert isinstance(timbl, TimblFile)
        for required_opt in ("+vo", "+vdb", "+vdi", "-m"):
            assert required_opt in timbl.default_opts
    else:
        assert descriptor
        timbl = TimblFile(default_opts=timbl_options_string(descriptor))

    result = timbl.cross_validate(
        inst_fns,
        test_inst_fns=test_inst_fns,
        out_fns=out_fns,
        log_fns=log_fns,
        options=options,
        n=n,
        log=log,
        out_dir=clas_dir)
    return result
Пример #15
0
def classify_file_cv(inst_fns,
                     test_inst_fns=None,
                     out_fns=None,
                     log_fns=None,
                     clas_dir=None,
                     descriptor=None,
                     timbl=None,
                     options="",
                     n=None,
                     log=False):
    """
    Cross-validate a Timbl classifier over a list of instance files.

    @param inst_fns: a list of instance filenames for training; if no
    test_inst_fns is supplied, the same files will be used for testing,
    otherwise they are used for training only

    @keyword test_inst_fns: a list of instance filenames for testing; this
    allows for down-sampling of the training instances without affecting the
    test instances

    @keyword out_fns: list of classifier output files to be created

    @keyword log_fns: list of classifier log files to be created; ignored if
    keyword log is false

    @keyword clas_dir: directory for creating classifier output files
    (created if given); ignored if out_fns is given

    @keyword descriptor: a Descriptor instance from which the default Timbl
    options are derived; required unless a TimblFile is supplied, in which
    case it is ignored

    @keyword timbl: a tailored TimblFile instance; notice that its default
    options must at least include +vo, +vdb, +vdi, and the -m option to
    specify that the administrative features must be ignored.

    @keyword options: additional Timbl options (default is the empty
    string), excluding -f, -m, +vo, +vdb, +vdi

    @keyword n: limit on the number of cross-validations performed (by default
    equals the number of instance filenames)

    @keyword log: log Timbl's standard output and error streams to file

    @return: list of Timbl output filenames
    """
    if clas_dir:
        makedirs(clas_dir)

    if not timbl:
        assert descriptor
        default_opts = timbl_options_string(descriptor)
        timbl = TimblFile(default_opts=default_opts)
    else:
        # the descriptor is ignored; the supplied classifier must already
        # carry the mandatory options
        assert isinstance(timbl, TimblFile)
        mandatory = ("+vo", "+vdb", "+vdi", "-m")
        assert all(opt in timbl.default_opts for opt in mandatory)

    return timbl.cross_validate(
        inst_fns,
        test_inst_fns=test_inst_fns,
        out_fns=out_fns,
        log_fns=log_fns,
        options=options,
        n=n,
        log=log,
        out_dir=clas_dir)
Пример #16
0
def pickle(setting):
    """
    Save the setting to a pickle file.

    Nothing is done unless setting.pickle is true. Otherwise the pickle
    directory is created and the setting is dumped to setting.pickle_fname.

    @param setting: Setting instance specifying the experimental setting
    """
    if setting.pickle:
        makedirs(setting.pickle_dir)
        log.info("saving pickled setting {0}".format(setting.pickle_fname))
        # the with-statement flushes and closes the file even when dumping
        # fails, instead of leaving the handle open
        with open(setting.pickle_fname, "wb") as pkl_file:
            dump(setting, pkl_file)
Пример #17
0
def exp_dev_fast(setting):
    """
    Perform a fast alignment experiment on development data.

    Weighting, matching and merging take place per test corpus without
    writing intermediary results to a file.

    @param setting: Setting instance specifying the experimental setting;
    setting.develop must be true and setting.validate must be false
    """
    assert setting.develop and not setting.validate

    exp_init(setting)

    create_parts(setting)

    # It's impossible to do extraction one corpus a time, because in order to
    # classify a test corpus you need instances for all the other training
    # corpora! Moreover, since Timbl classification is file-based, we need to
    # write the corpus instance files to disk. These files can be huge and
    # keeping all of them in memory seems to offer little benefit.
    extract(setting)

    sample(setting)

    # Timbl writes its output to a file, which then needs to be parsed in
    # order to insert the class predictions and weights into the corpus
    # instances. That means there is no advantage to doing classification
    # one corpus a time.
    classify(setting)

    log.info("\n" + header("WEIGHT/MATCH/MERGE STEP"))
    # reset evaluator
    if setting.evaluate:
        setting.evaluator.__init__()

    # zip() is wrapped in list() so that slicing also works where zip()
    # returns an iterator; a setting.n of None selects all files
    file_triples = zip(setting.dev_inst_fns, setting.dev_clas_fns,
                       setting.dev_true_fns)
    scope = list(file_triples)[:setting.n]

    for inst_fname, out_fname, true_fname in scope:
        log.info("reading corpus instances {0}".format(inst_fname))
        corpus_inst = CorpusInst()
        corpus_inst.loadtxt(inst_fname, setting.descriptor.dtype)

        if setting.weight:
            log.info("reading classifier output {0}".format(out_fname))
            # Weighting happens inside the with-block because the parsed
            # output may be read lazily; the file is closed afterwards
            # rather than being left to the garbage collector.
            with open(out_fname) as out_file:
                timbl_out = parse_timbl_output(out_file)
                log.info("weighting...")
                weight_corpus(corpus_inst, timbl_out, setting.weight_func)

        if setting.match:
            log.info("matching...")
            match_corpus(corpus_inst, setting.matcher)

        if setting.merge:
            log.info("reading true corpus {0}".format(true_fname))
            true_corpus = ParallelGraphCorpus(inf=true_fname,
                                              graph_loading=LOAD_NONE)
            log.info("merging...")
            pred_corpus = merge_corpus(corpus_inst, true_corpus,
                                       setting.merger)

        if setting.evaluate:
            # NOTE(review): true_corpus and pred_corpus are only bound by the
            # merge branch above, so evaluating with setting.merge disabled
            # fails on unbound names - confirm that combination is excluded
            name = os.path.basename(true_fname).split("_")[0]
            setting.evaluator.add(true_corpus, pred_corpus, name)

    if setting.evaluate:
        log.info("evaluting...")
        setting.evaluator.run_eval()
        log.info("saving evaluation {0}".format(setting.dev_eval_fname))
        makedirs(setting.eval_dir)
        setting.evaluator.write(setting.dev_eval_fname)

    exp_exit(setting)
Пример #18
0
def pickle(setting):
    """
    Write the pickled setting to disk.

    When setting.pickle is false this is a no-op. Otherwise the pickle
    directory is created and the setting is dumped to setting.pickle_fname.

    @param setting: Setting instance specifying the experimental setting
    """
    if setting.pickle:
        makedirs(setting.pickle_dir)
        log.info("saving pickled setting {0}".format(setting.pickle_fname))
        # open the file in a with-statement so it is always closed, also
        # when dumping raises an exception
        with open(setting.pickle_fname, "wb") as pkl_file:
            dump(setting, pkl_file)