Example #1
 def __call__(self, context: TacticContext) -> int:
     if len(context.hypotheses) == 0:
         return 0
     hyp_types = [
         limitNumTokens(serapi_instance.get_hyp_type(hyp), self.max_length)
         for hyp in context.hypotheses
     ]
     goal = limitNumTokens(context.goal, self.max_length)
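     # Pick the hypothesis type most similar to the (truncated) goal, weighted by its symbol count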
     closest_hyp_type = max(hyp_types,
                            key=lambda x: SequenceMatcher(None, goal, x).
                            ratio() * len(get_symbols(x)))
     headToken = get_symbols(closest_hyp_type)[0]
     if headToken in self.headKeywords:
         return self.headKeywords.index(headToken) + 1
     else:
         return 0
Example #2
 def __call__(self, context: TacticContext) -> List[float]:
     identifiers = get_symbols(context.goal)
     locallyBoundInHyps = serapi_instance.get_vars_in_hyps(
         context.hypotheses)
     binders = [
         "forall\s+(.*)(?::.*)?,", "fun\s+(.*)(?::.*)?,", "let\s+\S+\s+:="
     ]
     punctuation = ["(", ")", ":", ",", "_", ":=", "=>", "{|", "|}"]
     locallyBoundInTerm = [
         var for binder_pattern in binders
         for varString in re.findall(binder_pattern, context.goal)
         for var in re.findall(r"\((\S+)\s+:", varString)
         if var not in punctuation
     ]
     globallyBoundIdentifiers = \
         [ident for ident in identifiers
          if ident not in locallyBoundInHyps + locallyBoundInTerm + punctuation]
     locallyBoundIdentifiers = [
         ident for ident in identifiers
         if ident not in globallyBoundIdentifiers + punctuation
     ]
     for var in locallyBoundInTerm:
         assert var in locallyBoundIdentifiers, \
             "{}, {}".format(globallyBoundIdentifiers, locallyBoundInTerm)
         locallyBoundIdentifiers.remove(var)
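     # Features: log(1 + number of locally bound identifiers), and the share of globally
     # bound identifiers among all bound identifiers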
     return [
         math.log1p(float(len(locallyBoundIdentifiers))),
         # math.log1p(float(len(globallyBoundIdentifiers))),
         float(len(globallyBoundIdentifiers)) / float(
             len(globallyBoundIdentifiers) + len(locallyBoundIdentifiers))
     ]
Example #3
def encode_seq_structural_data(data : RawDataset,
                               context_tokenizer_type : \
                               Callable[[List[str], int], Tokenizer],
                               num_keywords : int,
                               num_reserved_tokens: int) -> \
                               Tuple[StructDataset, Tokenizer, SimpleEmbedding]:
    embedding = SimpleEmbedding()

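    # Pair every hypothesis and goal with the encoded token of the tactic that was run,
    # for relevance-based keyword selection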
    hyps_and_goals = [
        hyp_or_goal for hyp_and_goal in [
            zip(hyps +
                [goal], itertools.repeat(embedding.encode_token(tactic)))
            for prev_tactics, hyps, goal, tactic in data
        ] for hyp_or_goal in hyp_and_goal
    ]
    context_tokenizer = make_keyword_tokenizer_relevance(
        hyps_and_goals, context_tokenizer_type, num_keywords,
        num_reserved_tokens)
    encodedData = []
    for prev_tactics, hyps, goal, tactic in data:
        stem, rest = serapi_instance.split_tactic(tactic)
        encodedData.append(
            ([context_tokenizer.toTokenList(hyp)
              for hyp in hyps], context_tokenizer.toTokenList(goal),
             (embedding.encode_token(stem),
              [hyp_index(hyps, arg) for arg in get_symbols(rest)])))

    return encodedData, context_tokenizer, embedding
Example #4
 def __call__(self, context: TacticContext) -> int:
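     # 0 = empty goal, 1 = head token not a keyword, otherwise keyword index + 2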
     if context.goal.strip() == "":
         return 0
     headToken = get_symbols(context.goal)[0]
     if headToken in self.headKeywords:
         return self.headKeywords.index(headToken) + 2
     else:
         return 1
Example #5
def numeric_args(in_data : TacticContext, tactic : str,
                 next_in_data : TacticContext,
                 arg_values : argparse.Namespace) -> bool:
    goal_words = get_symbols(in_data.goal)
    stem, rest = serapi_instance.split_tactic(tactic)
    args = get_subexprs(rest.strip("."))
    for arg in args:
        if not re.fullmatch(r"\d+", arg):
            return False
    return True
Example #6
def get_arg_idx(max_length: int, inter: ScrapedTactic) -> int:
    tactic_stem, tactic_rest = serapi_instance.split_tactic(inter.tactic)
    symbols = tokenizer.get_symbols(inter.context.focused_goal)
    arg = tactic_rest.split()[0].strip(".")
    assert arg in symbols, "tactic: {}, arg: {}, goal: {}, symbols: {}"\
        .format(inter.tactic, arg, inter.context.focused_goal, symbols)
    idx = symbols.index(arg)
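    # Reserve 0 for arguments whose position is at or past max_length; otherwise return a 1-based index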
    if idx >= max_length:
        return 0
    else:
        return idx + 1
Example #7
def args_token_in_goal(in_data: ContextData, tactic: str,
                       next_in_data: ContextData,
                       arg_values: argparse.Namespace) -> bool:
    goal = in_data["goal"]
    goal_words = get_symbols(cast(str, goal))[:arg_values.max_length]
    stem, rest = serapi_instance.split_tactic(tactic)
    args = get_subexprs(rest.strip("."))
    for arg in args:
        if arg not in goal_words:
            return False
    return True
Example #8
 def __init__(self, init_dataset: List[TacticContext],
              args: argparse.Namespace) -> None:
     headTokenCounts: typing.Counter[str] = Counter()
     for prev_tactics, hyps, goal in init_dataset:
         headToken = get_symbols(goal)[0]
         headTokenCounts[headToken] += 1
     self.headKeywords = [
         word for word, count in headTokenCounts.most_common(
             args.num_head_keywords)
     ]
     eprint("Goal head keywords are {}".format(self.headKeywords),
            guard=args.print_keywords)
Example #9
def get_stem_and_arg_idx(max_length: int, embedding: Embedding,
                         inter: ScrapedTactic) -> Tuple[int, int]:
    tactic_stem, tactic_rest = serapi_instance.split_tactic(inter.tactic)
    stem_idx = embedding.encode_token(tactic_stem)
    symbols = tokenizer.get_symbols(inter.context.focused_goal)
    arg = tactic_rest.split()[0].strip(".")
    assert arg in symbols, "tactic: {}, arg: {}, goal: {}, symbols: {}"\
        .format(inter.tactic, arg, inter.context.focused_goal, symbols)
    idx = symbols.index(arg)
    if idx >= max_length:
        return stem_idx, 0
    else:
        return stem_idx, idx + 1
Example #10
 def __init__(self, init_dataset: List[TacticContext],
              args: argparse.Namespace) -> None:
     self.max_length = args.max_length
     headTokenCounts: typing.Counter[str] = Counter()
     for prev_tactics, hyps, goal in init_dataset:
         for hyp in hyps:
             headToken = get_symbols(serapi_instance.get_hyp_type(hyp))[0]
             headTokenCounts[headToken] += 1
     self.headKeywords = [
         word for word, count in headTokenCounts.most_common(
             args.num_head_keywords)
     ]
     eprint("Hypothesis head keywords are {}".format(self.headKeywords),
            guard=args.print_keywords)
Example #11
 def predictKTactics(self, in_data : TacticContext, k : int) -> List[Prediction]:
     if len(in_data.hypotheses) == 0:
         return [Prediction("eauto", 0)]
     k = min(k, len(in_data.hypotheses))
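     # Rank hypotheses by SequenceMatcher similarity between their type's symbols and the goal,
     # weighted by hypothesis length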
     best_hyps = \
         sorted(in_data.hypotheses,
                reverse=True,
                key=lambda hyp:
                SequenceMatcher(None,
                                tokenizer.get_symbols(
                                    serapi_instance.get_hyp_type(hyp)),
                                in_data.goal).ratio() * len(hyp)
         )[:k]
     return [Prediction("apply " + serapi_instance.get_first_var_in_hyp(hyp) + ".",
                        .5 ** idx) for idx, hyp in enumerate(best_hyps)]
Example #12
 def __init__(self, init_dataset : List[TacticContext],
              args : argparse.Namespace) -> None:
     headTokenCounts : typing.Counter[str] = Counter()
     for relevant_lemmas, prev_tactics, hyps, goal in init_dataset:
         headToken = get_symbols(goal)[0]
         headTokenCounts[headToken] += 1
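     # Reuse a saved keyword list when available; otherwise keep the most common head
     # tokens (and optionally save them)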
     if args.load_head_keywords and Path2(args.load_head_keywords).exists():
         self.headKeywords = torch.load(args.load_head_keywords)
     else:
         self.headKeywords = [word for word, count in
                              headTokenCounts.most_common(args.num_head_keywords)]
     if args.save_head_keywords:
         torch.save(self.headKeywords, args.save_head_keywords)
     eprint("Head keywords are {}".format(self.headKeywords),
            guard=args.print_keywords)
Example #13
 def from_data(init_dataset: List[TacticContext],
               args: argparse.Namespace) -> 'TopLevelTokenInBestHyp':
     headTokenCounts: typing.Counter[str] = Counter()
     for relevant_lemmas, prev_tactics, hyps, goal in init_dataset:
         for hyp in hyps:
             headToken = get_symbols(serapi_instance.get_hyp_type(hyp))[0]
             headTokenCounts[headToken] += 1
     if args.load_head_keywords and Path2(args.load_head_keywords).exists():
         result = TopLevelTokenInBestHyp(
             args, torch.load(args.load_head_keywords))
     else:
         result = TopLevelTokenInBestHyp(
             [word for word, count in
              headTokenCounts.most_common(args.num_head_keywords)])
     eprint("Hypothesis head keywords are {}".format(result.headKeywords),
            guard=args.print_keywords)
     return result
Example #14
def args_token_in_goal(in_data : TacticContext, tactic : str,
                       next_in_data : TacticContext,
                       arg_values : argparse.Namespace) -> bool:
    goal_words = get_symbols(in_data.goal)[:arg_values.max_length]
    stem, rest = serapi_instance.split_tactic(tactic)
    if len(rest) > 0 and rest[-1] == '.':
        rest = rest[:-1]
    args = get_subexprs(rest)
    # While the arguments to an intro(s) might *look* like
    # goal arguments, they are actually fresh variables
    if (stem == "intros" or stem == "intro") and len(args) > 0:
        return False
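    # Every remaining argument must match some symbol in the truncated goal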
    for arg in args:
        if not any([serapi_instance.symbol_matches(goal_word, arg)
                    for goal_word in goal_words]):
            return False
    return True
Example #15
 def from_data(init_dataset: List[TacticContext],
               args: argparse.Namespace) -> 'TopLevelTokenInGoal':
     headTokenCounts: typing.Counter[str] = Counter()
     for relevant_lemmas, prev_tactics, hyps, goal in init_dataset:
         if goal.strip() == "":
             continue
         headToken = get_symbols(goal)[0]
         headTokenCounts[headToken] += 1
     if args.load_head_keywords and Path2(args.load_head_keywords).exists():
         result = TopLevelTokenInGoal(torch.load(args.load_head_keywords))
     else:
         result = TopLevelTokenInGoal(
             [word for word, count in
              headTokenCounts.most_common(args.num_head_keywords)])
     if args.save_head_keywords:
         torch.save(result.headKeywords, args.save_head_keywords)
     eprint("Goal head keywords are {}".format(result.headKeywords),
            guard=args.print_keywords)
     return result
Example #16
 def _encode_action(self, context: TacticContext, action: str) \
         -> Tuple[int, int]:
     stem, argument = serapi_instance.split_tactic(action)
     stem_idx = emap_lookup(self.tactic_map, 32, stem)
     all_premises = context.hypotheses + context.relevant_lemmas
     stripped_arg = argument.strip(".").strip()
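     # arg_idx encoding: 0 = no argument, 1 = unrecognized argument; hypothesis arguments
     # are encoded by the head token of their hypothesis's type (offset by 2), goal
     # arguments by their own token (offset by 2 + 128)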
     if stripped_arg == "":
         arg_idx = 0
     else:
         index_hyp_vars = dict(
             serapi_instance.get_indexed_vars_in_hyps(all_premises))
         if stripped_arg in index_hyp_vars:
             hyp_varw, _, rest = all_premises[index_hyp_vars[stripped_arg]]\
                 .partition(":")
             arg_idx = emap_lookup(self.token_map, 128,
                                   tokenizer.get_words(rest)[0]) + 2
         else:
             goal_symbols = tokenizer.get_symbols(context.goal)
             if stripped_arg in goal_symbols:
                 arg_idx = emap_lookup(self.token_map, 128,
                                       stripped_arg) + 128 + 2
             else:
                 arg_idx = 1
     return stem_idx, arg_idx
Example #17
def score_hyp_type(goal: str, hyp_type: str, max_length: int) -> float:
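    # Similarity of the hypothesis type to the goal, scaled by the hypothesis's symbol count relative to max_length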
    ratio = SequenceMatcher(None, goal, hyp_type).ratio()
    score = ratio * (len(get_symbols(hyp_type)) / max_length)
    return score
Example #18
 def __call__(self, context: TacticContext) -> List[float]:
     headToken = get_symbols(context.goal)[0]
     oneHotHeads = [0.] * len(self.headKeywords)
     if headToken in self.headKeywords:
         oneHotHeads[self.headKeywords.index(headToken)] = 1.
     return oneHotHeads
Example #19
def get_arg_from_token_idx(goal: str, idx: int) -> str:
    goal_symbols = tokenizer.get_symbols(goal.strip("."))
    if idx < len(goal_symbols):
        return goal_symbols[idx]
    else:
        return ""
Example #20
 def __call__(self, context: TacticContext) -> int:
     headToken = get_symbols(context.goal)[0]
     if headToken in self.headKeywords:
         return self.headKeywords.index(headToken) + 1
     else:
         return 0