def test_basic_sdcp_parsing_constituency(self):
    tree1 = constituent_tree_1()
    tree2 = constituent_tree_2()
    tree3 = constituent_tree_1_pos_stripped()
    terminal_labeling = FormTerminals()
    fanout = 1

    # induce an LCFRS/sDCP hybrid grammar from the two training trees
    grammar = LCFRS('START')
    for tree in [tree1, tree2]:
        tree_part = tree.unlabelled_structure()
        part = fanout_limited_partitioning(tree_part, fanout)
        tree_grammar = fringe_extract_lcfrs(tree, part, naming='child',
                                            term_labeling=terminal_labeling)
        grammar.add_gram(tree_grammar)
    grammar.make_proper()

    print("grammar induced. Printing rules...", file=stderr)
    for rule in grammar.rules():
        print(rule, file=stderr)

    parser_type = LCFRS_sDCP_Parser

    print("preprocessing grammar", file=stderr)
    parser_type.preprocess_grammar(grammar, terminal_labeling, debug=True)

    print("invoking parser", file=stderr)
    parser = parser_type(grammar, tree1)

    print("listing derivations", file=stderr)
    for der in parser.all_derivation_trees():
        print(der)
        # evaluate the sDCP component of the derivation and rebuild a constituent tree
        output_tree = ConstituentTree(tree1.sent_label())
        tokens = [construct_constituent_token(token.form(), '--', True)
                  for token in tree1.token_yield()]
        dcp_to_hybridtree(output_tree, DCP_evaluator(der).getEvaluation(),
                          tokens, False, construct_constituent_token)
        print(tree1)
        print(output_tree)

    # parse the POS-stripped variant of tree1 as well
    parser = parser_type(grammar, tree3)
    print(parser.recognized())
    for der in parser.all_derivation_trees():
        print(der)
        output_tree = ConstituentTree(tree3.sent_label())
        tokens = [construct_constituent_token(token.form(), '--', True)
                  for token in tree3.token_yield()]
        dcp_to_hybridtree(output_tree, DCP_evaluator(der).getEvaluation(),
                          tokens, False, construct_constituent_token)
        print(tree3)
        print(output_tree)

    print("completed test", file=stderr)
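# The induction loop above is the generic recipe used throughout these tests:
# partition the tree, extract a fringe grammar per tree, merge, normalize.
# A minimal sketch of that recipe as a reusable helper; the helper name
# `induce_lcfrs_from_trees` is hypothetical, all calls mirror the test above.
def induce_lcfrs_from_trees(trees, terminal_labeling, fanout=1, start='START'):
    grammar = LCFRS(start)
    for tree in trees:
        # restrict the recursive partitioning to the desired fanout
        part = fanout_limited_partitioning(tree.unlabelled_structure(), fanout)
        grammar.add_gram(fringe_extract_lcfrs(tree, part, naming='child',
                                              term_labeling=terminal_labeling))
    grammar.make_proper()  # normalize rule weights into a proper distribution
    return grammar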
def induction_on_a_corpus(dsgs, rec_part_strategy, nonterminal_labeling, terminal_labeling,
                          start="START", normalize=True):
    grammar = LCFRS(start=start)
    for dsg in dsgs:
        rec_part = rec_part_strategy(dsg)
        # if calc_fanout(rec_part) > 1 or calc_rank(rec_part) > 2:
        #     rec_part = rec_part_strategy(dsg)
        #     assert False
        decomp = compute_decomposition(dsg, rec_part)
        dsg_grammar = induce_grammar_from(dsg, rec_part, decomp, nonterminal_labeling,
                                          terminal_labeling, terminal_labeling, start, normalize)
        grammar.add_gram(dsg_grammar)
    return grammar
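# Hedged usage sketch for induction_on_a_corpus. A `rec_part_strategy` maps a
# DSG to a recursive partitioning; everything marked "hypothetical" below is
# illustration only, not a verified part of this code base's API.
def example_corpus_induction(dsgs):
    def strategy(dsg):
        # hypothetical: assumes a DSG exposes a recursive partitioning of its tree part
        return fanout_limited_partitioning(dsg.recursive_partitioning(), 1)
    return induction_on_a_corpus(
        dsgs, strategy,
        nonterminal_labeling=lambda dsg, nodes: 'NT',      # hypothetical labeling
        terminal_labeling=lambda token: token.form())      # hypothetical labeling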
def induce_grammar(self, corpus, start="START"):
    grammar = LCFRS(start=start)
    for obj in corpus:
        obj = self.preprocess_before_induction(obj)
        obj_grammar, features = self.induce_from(obj)
        if obj_grammar is None:
            continue
        if features is None:
            grammar.add_gram(obj_grammar, None)
        else:
            grammar.add_gram(obj_grammar, (self.feature_log, features))
    self.postprocess_grammar(grammar)
    self.base_grammar = grammar
    # persist the base grammar so later stages of the experiment can resume from it
    _, path = tempfile.mkstemp(suffix=".base.grammar", dir=self.directory)
    with open(path, 'wb') as f:
        pickle.dump(self.base_grammar, f)
    self.stage_dict["base_grammar"] = path
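# The method above pickles the base grammar and records its path in
# self.stage_dict, so a later stage can skip re-induction. A minimal sketch of
# the matching reload step (the method name `load_base_grammar` is hypothetical;
# the pickle round-trip mirrors the dump above):
def load_base_grammar(self):
    with open(self.stage_dict["base_grammar"], 'rb') as f:
        self.base_grammar = pickle.load(f)
    return self.base_grammar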
def test_negra_dag_small_grammar(self):
    DAG_CORPUS = 'res/tiger/tiger_full_with_sec_edges.export'
    DAG_CORPUS_BIN = 'res/tiger/tiger_full_with_sec_edges_bin_h1_v1.export'
    names = list([str(i) for i in range(1, 101)])
    if not os.path.exists(DAG_CORPUS):
        print('run the following command to create an export corpus with dags:')
        print('\tPYTHONPATH=. util/tiger_dags_to_negra.py '
              + 'res/tiger/tiger_release_aug07.corrected.16012013.xml '
              + DAG_CORPUS + ' 1 50474')
    self.assertTrue(os.path.exists(DAG_CORPUS))

    if not os.path.exists(DAG_CORPUS_BIN):
        print('run the following command to binarize the export corpus with dags:')
        print("discodop treetransforms --binarize -v 1 -h 1 " + DAG_CORPUS + " " + DAG_CORPUS_BIN)
        # _, DAG_CORPUS_BIN = tempfile.mkstemp(prefix='corpus_bin_', suffix='.export')
        # subprocess.call(["discodop", "treetransforms", "--binarize", "-v", "1", "-h", "1",
        #                  DAG_CORPUS, DAG_CORPUS_BIN])
    self.assertTrue(os.path.exists(DAG_CORPUS_BIN))

    corpus = np.sentence_names_to_hybridtrees(names, DAG_CORPUS, secedge=True)
    corpus_bin = np.sentence_names_to_hybridtrees(names, DAG_CORPUS_BIN, secedge=True)

    # extract one LCFRS/DCP-hybrid grammar per prebinarized DAG and merge them
    grammar = LCFRS(start="START")
    for hybrid_dag, hybrid_dag_bin in zip(corpus, corpus_bin):
        self.assertEqual(len(hybrid_dag.token_yield()), len(hybrid_dag_bin.token_yield()))
        dag_grammar = direct_extract_lcfrs_from_prebinarized_corpus(hybrid_dag_bin)
        grammar.add_gram(dag_grammar)
    grammar.make_proper()
    print("Extracted LCFRS/DCP-hybrid grammar with %i nonterminals and %i rules"
          % (len(grammar.nonts()), len(grammar.rules())))

    parser = DiscodopKbestParser(grammar, k=1)

    _, RESULT_FILE = tempfile.mkstemp(prefix='parser_results_', suffix='.export')
    with open(RESULT_FILE, 'w') as results:
        for hybrid_dag in corpus:
            poss = list(map(lambda x: x.pos(), hybrid_dag.token_yield()))
            parser.set_input(poss)
            parser.parse()
            self.assertTrue(parser.recognized())
            der = parser.best_derivation_tree()

            # turn the best derivation back into a hybrid DAG via its DCP component
            dcp_term = DCP_evaluator(der).getEvaluation()
            dag_eval = HybridDag(hybrid_dag.sent_label())
            dcp_to_hybriddag(dag_eval, dcp_term, copy.deepcopy(hybrid_dag.token_yield()),
                             False, construct_token=construct_constituent_token)

            lines = np.serialize_hybridtrees_to_negra([dag_eval], 1, 500,
                                                      use_sentence_names=True)
            for line in lines:
                print(line, end='', file=results)
            parser.clear()
    print("Wrote results to %s" % RESULT_FILE)
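# A minimal sketch of the parse-one-sentence round trip used in the loop above,
# factored out for clarity. The helper name `parse_to_hybrid_dag` is
# hypothetical; all calls are taken verbatim from the test.
def parse_to_hybrid_dag(parser, hybrid_dag):
    parser.set_input([token.pos() for token in hybrid_dag.token_yield()])
    parser.parse()
    if not parser.recognized():
        parser.clear()
        return None
    der = parser.best_derivation_tree()
    dag_eval = HybridDag(hybrid_dag.sent_label())
    dcp_to_hybriddag(dag_eval, DCP_evaluator(der).getEvaluation(),
                     copy.deepcopy(hybrid_dag.token_yield()), False,
                     construct_token=construct_constituent_token)
    parser.clear()
    return dag_eval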
def main():
    # # induce or load grammar
    # if not os.path.isfile(grammar_path):
    #     grammar = LCFRS('START')
    #     for tree in train_corpus:
    #         if not tree.complete() or tree.empty_fringe():
    #             continue
    #         part = recursive_partitioning(tree)
    #         tree_grammar = fringe_extract_lcfrs(tree, part, naming='child',
    #                                             term_labeling=terminal_labeling)
    #         grammar.add_gram(tree_grammar)
    #     grammar.make_proper()
    #     pickle.dump(grammar, open(grammar_path, 'wb'))
    # else:
    #     grammar = pickle.load(open(grammar_path, 'rb'))
    grammar = LCFRS('START')
    for tree in train_corpus:
        if not tree.complete() or tree.empty_fringe():
            continue
        part = recursive_partitioning(tree)
        tree_grammar = fringe_extract_lcfrs(tree, part, naming='child',
                                            term_labeling=terminal_labeling)
        grammar.add_gram(tree_grammar)
    grammar.make_proper()

    # # compute or load reducts
    # if not os.path.isfile(reduct_path):
    #     traceTrain = compute_reducts(grammar, train_corpus, terminal_labeling)
    #     traceTrain.serialize(reduct_path)
    # else:
    #     traceTrain = PySDCPTraceManager(grammar, terminal_labeling)
    #     traceTrain.load_traces_from_file(reduct_path)
    traceTrain = compute_reducts(grammar, train_corpus, terminal_labeling)
    traceValidationGenetic = compute_reducts(grammar, validation_genetic_corpus, terminal_labeling)
    traceValidation = compute_reducts(grammar, validation_corpus, terminal_labeling)

    # prepare EM training
    grammarInfo = PyGrammarInfo(grammar, traceTrain.get_nonterminal_map())
    if not grammarInfo.check_for_consistency():
        print("[Genetic] GrammarInfo is not consistent!")
    storageManager = PyStorageManager()

    em_builder = PySplitMergeTrainerBuilder(traceTrain, grammarInfo)
    em_builder.set_em_epochs(em_epochs)
    em_builder.set_simple_expector(threads=threads)
    emTrainer = em_builder.build()

    # randomize initial weights and do EM training
    la_no_splits = build_PyLatentAnnotation_initial(grammar, grammarInfo, storageManager)
    la_no_splits.add_random_noise(seed=seed)
    emTrainer.em_train(la_no_splits)
    la_no_splits.project_weights(grammar, grammarInfo)

    # emTrainerOld = PyEMTrainer(traceTrain)
    # emTrainerOld.em_training(grammar, 30, "rfe", tie_breaking=True)

    # compute parses for the validation set
    baseline_parser = GFParser_k_best(grammar, k=k_best)
    validator = build_score_validator(grammar, grammarInfo, traceTrain.get_nonterminal_map(),
                                      storageManager, terminal_labeling, baseline_parser,
                                      validation_corpus, validationMethod)
    del baseline_parser

    # prepare SM training
    builder = PySplitMergeTrainerBuilder(traceTrain, grammarInfo)
    builder.set_em_epochs(em_epochs)
    builder.set_split_randomization(1.0, seed + 1)
    builder.set_simple_expector(threads=threads)
    builder.set_score_validator(validator, validationDropIterations)
    builder.set_smoothing_factor(smoothingFactor=smoothing_factor)
    builder.set_split_randomization(percent=split_randomization)
    splitMergeTrainer = builder.set_scc_merger(threshold=scc_merger_threshold,
                                               threads=threads).build()
    splitMergeTrainer.setMaxDrops(validationDropIterations, mode="smoothing")
    splitMergeTrainer.setEMepochs(em_epochs, mode="smoothing")

    # set initial latent annotations
    latentAnnotations = []
    for i in range(0, genetic_initial):
        splitMergeTrainer.reset_random_seed(seed + i + 1)
        la = splitMergeTrainer.split_merge_cycle(la_no_splits)
        if not la.check_for_validity():
            print('[Genetic] Initial LA', i, 'is not consistent! (See details before)')
        if not la.is_proper():
            print('[Genetic] Initial LA', i, 'is not proper!')
        heapq.heappush(latentAnnotations,
                       (evaluate_la(grammar, grammarInfo, la, traceValidationGenetic,
                                    validation_genetic_corpus), i, la))
        print('[Genetic] added initial LA', i)

    (fBest, idBest, laBest) = min(latentAnnotations)
    validation_score = evaluate_la(grammar, grammarInfo, laBest, traceValidation, test_corpus)
    print("[Genetic] Started with best F-Score (Test) of", validation_score,
          "from Annotation", idBest)

    geneticCount = genetic_initial
    random.seed(seed)
    for round in range(1, genetic_cycles + 1):
        print("[Genetic] Starting Recombination Round", round)
        # newpopulation = list(latentAnnotations)
        newpopulation = []
        # cross all candidates!
        for leftIndex in range(0, len(latentAnnotations)):
            (fLeft, idLeft, left) = latentAnnotations[leftIndex]
            # TODO: How to determine NTs to keep?

            # do SM-training on the candidate unpacked above
            print("[Genetic] do SM-training on", idLeft, "and create LA", geneticCount)
            la = splitMergeTrainer.split_merge_cycle(left)
            if not la.check_for_validity():
                print('[Genetic] Split/Merge introduced invalid weights into LA', geneticCount)
            if not la.is_proper():
                print('[Genetic] Split/Merge introduced problems with properness of LA',
                      geneticCount)

            fscore = evaluate_la(grammar, grammarInfo, la, traceValidationGenetic,
                                 validation_genetic_corpus)
            print("[Genetic] LA", geneticCount, "has F-score:", fscore)
            heapq.heappush(newpopulation, (fscore, geneticCount, la))
            geneticCount += 1

        heapq.heapify(newpopulation)
        # keep the `genetic_population` best candidates from old and new population
        latentAnnotations = heapq.nsmallest(genetic_population,
                                            heapq.merge(latentAnnotations, newpopulation))
        heapq.heapify(latentAnnotations)

        (fBest, idBest, laBest) = min(latentAnnotations)
        validation_score = evaluate_la(grammar, grammarInfo, laBest, traceValidation, test_corpus)
        print("[Genetic] Best LA", idBest, "has F-Score (Test) of", validation_score)
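# The survivor selection above relies on Python's heapq: the score is the first
# tuple component, so nsmallest keeps the candidates with the smallest leading
# score, matching the min-heap convention used throughout main(). A
# self-contained sketch of that selection step on plain tuples:
import heapq
from itertools import chain

def select_survivors(old_population, new_population, size):
    # nsmallest scans the whole iterable, so no sortedness assumption is needed
    return heapq.nsmallest(size, chain(old_population, new_population))

# e.g. select_survivors([(0.21, 0, 'a'), (0.35, 1, 'b')], [(0.18, 2, 'c')], 2)
# -> [(0.18, 2, 'c'), (0.21, 0, 'a')]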