Example #1
 def _search(self, string, tokstream):
     self.begin = 0
     self.end = 0
     n = len(tokstream)
     tracer = TokenTracer(self.langlet, self.symbol)
     initial = tracer.selectables()
     i = 0
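     # scan for a token that can begin self.symbol, then trace forward greedily;
     # K records the offset of the last token at which the traced span forms a
     # complete match (FIN selectable), so a failed select() can fall back to it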
     while i<n:
         tok   = tokstream[i]
         if tok[0] in initial:
             selection = []
             K = None
             for j,T in enumerate(tokstream[i:]):
                 if not self.accept_token(T):
                     continue
                 try:
                     selection = tracer.select(T[0])
                 except NonSelectableError:
                     if K is not None:
                         stream = tokstream[i:i+K+1]
                         if self.condition(stream):
                             m = CMatchObject(string)
                             first = stream[0]
                             last  = stream[-1]
                             m.begin = get_index(string, first[2])+first[-1][0]+1
                             m.end   = get_index(string, last[2])+last[-1][1]+1
                             m.matched = string[m.begin: m.end]
                             m.tokstream = TokenStream(stream)
                             m.tokpos    = i
                             return m
                     break
                 if FIN in selection:
                     K = j
             tracer = TokenTracer(self.langlet, self.symbol)
         i+=1
Example #2
 def _check_gene(self, gene):
     tr = TokenTracer(self.langlet)
     try:
         res, idx = tr.check(gene)
     except (KeyError, TypeError):
         print gene
         raise
     return res
Example #3
 def _lookahead(self, tokstream, S, sym):
     tokstream = tokstream.clone()
     tracer_data = {}
     tts = []
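     # tracer_data maps each tracer to (its origin item s, last stream position
     # at which that tracer could have finished); -1 means no finish point yet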
     for s in S:
         if is_symbol(s):
             tt = TokenTracer(self.langlet, s)
         else:
             # jump to state with nid = s within the current NFA
             states = self._get_states(s, sym)
             if not states:
                 return -1
             tt = TokenTracer(self.langlet,
                              sym,
                              jump_to_state=states,
                              without_expansion=False)
         tts.append(tt)
         tracer_data[tt] = (s, -1)
     n = len(tokstream)
     p = tokstream.position
     m = -1
     while p < n:
         nid = tokstream[p][0]
         if nid == INTRON_NID:
             p += 1
             continue
         removable = []
         for tt in tts:
             selection = tt.selectables()
             if FIN in selection:
                 s, _ = tracer_data[tt]
                 tracer_data[tt] = (s, p)
                 m = p
             if nid not in selection:
                 removable.append(tt)
             else:
                 tt.select(nid)
         for tt in removable:
             tts.remove(tt)
         if len(tts) == 1:
             s, q = tracer_data[tts[0]]
             if q >= 0:
                 return (s, p, None)
         elif len(tts) == 0:
             if p > m:
                 ttcancel = removable[-1]
                 s, _ = tracer_data[ttcancel]
                 self._last_scan_point = (s, p, tokstream[p],
                                          ttcancel.selectables())
             if m >= 0:
                 for tt, (s, i) in tracer_data.items():
                     if i == m:
                         return (s, p, None)
             selectable = set()
             for tt in removable:
                 selectable.update(tt.selectables())
             return (-1, p, selectable)
         p += 1
     return (-1, p, set())
Example #4
    def gen_token_string(self, nid):
        tracer = TokenTracer(self.langlet, nid, "lex")
        selection = list(tracer.selectables())
        S = []
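        # S accumulates the characters of the generated token; each step picks a
        # random selectable terminal and a character that realizes it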
        while True:
            n = len(selection)
            if selection == [FIN]:
                return ''.join(S)
            if len(S) > 20:
                return self.gen_token_string(nid)
            while True:
                m = random.randrange(0, n)
                t = selection[m]
                if t is FIN:
                    continue
                try:
                    chars = list(self.lexer_terminal[t])
                except KeyError:
                    return ''
                if not chars:
                    other_chars = reduce(lambda S, T: S.union(T), [
                        self.lexer_terminal.get(r, set())
                        for r in selection if r != t
                    ], set())
                    while True:
                        c = random_printable()
                        if c == '\\':
                            continue
                        if c not in other_chars:
                            break
                    S.append(c)
                else:
                    c = chars[random.randrange(0, len(chars))]
                    S.append(c)
                selection = list(tracer.select(t))
                break

            if len(S) >= self.stdlen:
                if FIN in selection:
                    if S[0] in ('"', "'"):
                        if len(S) >= 4:
                            return ''.join(S)
                    if S[0] in string.digits:
                        if len(S) >= 4:
                            return ''.join(S)
                    if random.randrange(0, 2) == 0:
                        return ''.join(S)
Example #5
 def delete(self, g):
     visited = set()
     while True:
         n = len(g) - 1
         if len(visited)>=n:
             return g
         gene = g[:]
         k = random.randrange(0, n)
         visited.add(k)
         T = gene[k+1][0]
         del gene[k+1]
         n-=1
         R = self.get_right_par(T+SYMBOL_OFFSET)  # TODO: consider 'extended braces'
         loc = []
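         # if the deleted token opens (or closes) a bracket pair, collect candidate
         # positions of its matching partner so one of them can be removed as well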
         if R:
             R-=SYMBOL_OFFSET
             for i, tok in enumerate(gene[k+1:]):
                 if tok[0] == R:
                     loc.append(i+k+1)
         else:
             L = self.get_left_par(T+SYMBOL_OFFSET)
             if L:
                 L-=SYMBOL_OFFSET
                 for i, tok in enumerate(gene[:k]):
                     if tok[0] == L:
                         loc.append(i)
         if loc:
             while loc:
                 m = loc[random.randrange(0, len(loc))]
                 backup = gene[m]
                 del gene[m]
                 tr = TokenTracer(self.langlet)
                 res, idx = tr.check(gene)
                 if res == True:
                     return gene
                 else:
                     loc.remove(m)
                     gene.insert(m, backup)
             continue
         else:
             if self._check_gene(gene):
                 return gene
             else:
                 continue
Example #6
    def gen_token_string(self, nid):
        tracer = TokenTracer(self.langlet, nid, "lex")
        selection = list(tracer.selectables())
        S = []
        while True:
            n = len(selection)
            if selection == [FIN]:
                return ''.join(S)
            if len(S)>20:
                return self.gen_token_string(nid)
            while True:
                m = random.randrange(0, n)
                t = selection[m]
                if t is FIN:
                    continue
                try:
                    chars = list(self.lexer_terminal[t])
                except KeyError:
                    return ''
                if not chars:
                    other_chars = reduce(lambda S, T: S.union(T), [self.lexer_terminal.get(r, set()) for r in selection if r!=t], set())
                    while True:
                        c = random_printable()
                        if c == '\\':
                            continue
                        if c not in other_chars:
                            break
                    S.append(c)
                else:
                    c = chars[random.randrange(0, len(chars))]
                    S.append(c)
                selection = list(tracer.select(t))
                break

            if len(S) >= self.stdlen:
                if FIN in selection:
                    if S[0] in ('"', "'"):
                        if len(S)>=4:
                            return ''.join(S)
                    if S[0] in string.digits:
                        if len(S)>=4:
                            return ''.join(S)
                    if random.randrange(0,2) == 0:
                        return ''.join(S)
Example #7
 def insert(self, g):
     trials = set()
     while True:
         gene = g[:]
         n = len(gene) - 1
         k, T, tracer = self._seek_random_item(gene, trials)
         if T is None:
             continue
         value = self.gen_token_string(T+SYMBOL_OFFSET)
         gene.insert(k+1, [T, value])
         n+=1
         R = self.get_right_par(T+SYMBOL_OFFSET)  # TODO: consider 'extended braces'
         if R:
             R-=SYMBOL_OFFSET
             i = 1
             loc = []
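             # having inserted an opening paren-like token, walk the tracer forward
             # and record positions where the matching closing token R is acceptable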
             while k+i<n:
                 try:
                     selection = tracer.select(gene[k+i][0])
                 except NonSelectableError:
                     break
                 if R in selection:
                     loc.append(k+i)
                 i+=1
             if loc:
                 value = self.gen_token_string(R+SYMBOL_OFFSET)
                 while loc:
                     m = loc[random.randrange(0, len(loc))]
                     gene.insert(m+1, [R, value])
                     tr = TokenTracer(self.langlet)
                     res, idx = tr.check(gene)
                     if res == True:
                         return gene
                     else:
                         loc.remove(m)
                 continue
             else:
                 continue
         else:
             if self._check_gene(gene):
                 return gene
             else:
                 continue
Example #8
 def _repair(self, tokenstream):
     '''
     Repair the token stream if necessary: insert constant tokens where they are missing.
     '''
     n = len(tokenstream)
     tt = TokenTracer(self.langlet, start=self.start_symbol)
     selectables = tt.selectables()
     repaired = []
     i = 0
     while i < n:
         tok = tokenstream[i]
         # print tok
         if tok[0] in selectables:
             repaired.append(tok)
             selectables = tt.select(tok[0])
         else:
             # find a const token T which can be inserted between token[i] and token[i+1]
             S = []
             for s in selectables:
                 if s in self.constants:
                     _tt = tt.clone()
                     _selectables = _tt.select(s)
                     if i == n - 1:
                         if FIN in _selectables:
                             repaired.append([s, self.constants[s]])
                             break
                     else:
                         T = tokenstream[i + 1]
                         if T[0] in _selectables:
                             S.append(s)
                             selectables = _selectables
             else:
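                 # this for/else branch runs only if the loop above did not break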
                 if S == []:
                     if tok[1].strip() == "":  # forgotten a linebreak?
                         i += 1
                         continue
                     # TODO: replace this by an expressive error message
                     self.compute_syntax_error(tokenstream, i)
                 elif len(S) == 1:
                     # exactly one constant token bridges the gap; use it (its
                     # selection was already recorded above when it was appended to S)
                     repaired.append([S[0], self.constants[S[0]]])
         i += 1
     return repaired
Example #9
 def _repair(self, tokenstream):
     '''
     Repair the token stream if necessary: insert constant tokens where they are missing.
     '''
     n  = len(tokenstream)
     tt = TokenTracer(self.langlet, start = self.start_symbol)
     selectables = tt.selectables()
     repaired = []
     i = 0
     while i<n:
         tok = tokenstream[i]
         # print tok
         if tok[0] in selectables:
             repaired.append(tok)
             selectables = tt.select(tok[0])
         else:
             # find a const token T which can be inserted between token[i] and token[i+1]
             S = []
             for s in selectables:
                 if s in self.constants:
                     _tt = tt.clone()
                     _selectables = _tt.select(s)
                     if i == n-1:
                         if FIN in _selectables:
                             repaired.append([s, self.constants[s]])
                             break
                     else:
                         T = tokenstream[i+1]
                         if T[0] in _selectables:
                             S.append(s)
                             selectables = _selectables
             else:
                 if S == []:
                     if tok[1].strip() == "":  # forgotten a linebreak?
                         i+=1
                         continue
                     # TODO: replace this by an expressive error message
                     self.compute_syntax_error(tokenstream, i)
                 elif len(S) == 1:
                     # exactly one constant token bridges the gap; use it (its
                     # selection was already recorded above when it was appended to S)
                     repaired.append([S[0], self.constants[S[0]]])
         i+=1
     return repaired
Example #10
 def _seek_random_item(self, gene, trials):
      n = len(gene) - 1
      if len(trials) == n:
          trials.clear()
      while True:
          k = random.randrange(-1, n)
          if k not in trials:
              break
      trials.add(k)
      tracer = TokenTracer(self.langlet)
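      # replay the gene prefix up to index k through a fresh tracer, then draw
      # a random follow-up token from its selectables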
      selection = []
      for i, tok in enumerate(gene):
          if i<=k:
              tracer.select(tok[0])
          else:
              break
      selection = list(tracer.selectables())
      m = random.randrange(0, len(selection))
      T = selection[m]
      return k, T, tracer
Example #11
    def run(self, start=None, maxlen=3, exclude = ()):
        ttracer = TokenTracer(self.langlet, start = start)
        L = []

        def create_trace(ttracer, selection, L, n):
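            # returns all token traces of at most n more tokens that end where the
            # symbol may complete (a selection containing None marks completion)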
            R = []
            if n == 0:
                if None in selection:
                    return [L]
                else:
                    return []
            for s in selection:
                if s is None:
                    R.append(L)
                elif s not in exclude:
                    subtracer = ttracer.clone()
                    subselect = subtracer.select(s)
                    R+=create_trace(subtracer, subselect, L+[s], n-1)
            return R

        traces = create_trace(ttracer, ttracer.selectables(), L, maxlen)
        return traces
Example #12
 def _lookahead(self, tokstream, S, sym):
     tokstream = tokstream.clone()
     tracer_data = {}
     tts = []
     for s in S:
         if is_symbol(s):
             tt = TokenTracer(self.langlet, s)
         else:
             # jump to state with nid = s within the current NFA
             states = self._get_states(s, sym)
             if not states:
                 return -1
             tt = TokenTracer(self.langlet, sym, jump_to_state = states, without_expansion = False)
         tts.append(tt)
         tracer_data[tt] = (s, -1)
     n = len(tokstream)
     p = tokstream.position
     m = -1
     while p<n:
         nid = tokstream[p][0]
         if nid == INTRON_NID:
             p+=1
             continue
         removable = []
         for tt in tts:
             selection = tt.selectables()
             if FIN in selection:
                 s, _ = tracer_data[tt]
                 tracer_data[tt] = (s, p)
                 m = p
             if nid not in selection:
                 removable.append(tt)
             else:
                 tt.select(nid)
         for tt in removable:
             tts.remove(tt)
         if len(tts) == 1:
             s, q = tracer_data[tts[0]]
             if q>=0:
                 return (s, p, None)
         elif len(tts) == 0:
             if p > m:
                 ttcancel = removable[-1]
                 s, _ = tracer_data[ttcancel]
                 self._last_scan_point = (s, p, tokstream[p], ttcancel.selectables())
             if m >= 0:
                 for tt, (s, i) in tracer_data.items():
                     if i == m:
                         return (s, p, None)
             selectable = set()
             for tt in removable:
                 selectable.update(tt.selectables())
             return (-1, p, selectable)
         p+=1
     return (-1, p, set())
Example #13
 def subst(self, g):
     trials = set()
     n = len(g) - 1
     while True:
         gene = g[:]
         k = random.randrange(-1, n)
         tracer = TokenTracer(self.langlet)
         for i, tok in enumerate(gene):
             if i<=k:
                 tracer.select(tok[0])
             else:
                 break
         selection = list(tracer.selectables())
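         # skip positions that admit only a single token; at branch points try
         # substituting gene[k+1] with a freshly generated token and keep the
         # mutated gene only if the whole stream still checks out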
         while k+1<n:
             if len(selection) == 1:
                 k+=1
                 selection = list(tracer.select(gene[k][0]))
                 continue
             while selection:
                 m = random.randrange(0, len(selection))
                 T = selection[m]
                 selection.remove(T)
                 if T is None:
                     continue
                 value = self.gen_token_string(T+SYMBOL_OFFSET)
                 backup = gene[k+1]
                 if backup[1] == value:
                     continue
                 gene[k+1] = [T, value]
                 tr = TokenTracer(self.langlet)
                 try:
                     res, idx = tr.check(gene)
                 except (KeyError, TypeError):
                     print gene
                     raise
                 if res == True:
                     return gene
                 else:
                     gene[k+1] = backup
             k+=1
Example #14
 def _search(self, string, tokstream):
     self.begin = 0
     self.end = 0
     n = len(tokstream)
     tracer = TokenTracer(self.langlet, self.symbol)
     initial = tracer.selectables()
     i = 0
     while i < n:
         tok = tokstream[i]
         if tok[0] in initial:
             selection = []
             K = None
             for j, T in enumerate(tokstream[i:]):
                 if not self.accept_token(T):
                     continue
                 try:
                     selection = tracer.select(T[0])
                 except NonSelectableError:
                     if K is not None:
                         stream = tokstream[i:i + K + 1]
                         if self.condition(stream):
                             m = CMatchObject(string)
                             first = stream[0]
                             last = stream[-1]
                             m.begin = get_index(
                                 string, first[2]) + first[-1][0] + 1
                             m.end = get_index(string,
                                               last[2]) + last[-1][1] + 1
                             m.matched = string[m.begin:m.end]
                             m.tokstream = TokenStream(stream)
                             m.tokpos = i
                             return m
                     break
                 if FIN in selection:
                     K = j
             tracer = TokenTracer(self.langlet, self.symbol)
         i += 1
Example #15
nfa_E = [
    "E: a '+' E | a '*' E | 'a'", (E, 0, E), {
        (E, 0, E): [(a, 1, E), (a, 2, E), (a, 3, E)],
        (a, 1, E): [(PLUS, 4, E)],
        (a, 2, E): [(MUL, 5, E)],
        (PLUS, 4, E): [(E, 6, E)],
        (MUL, 5, E): [(E, 7, E)],
        (a, 3, E): [(FIN, FEX, E)],
        (E, 6, E): [(FIN, FEX, E)],
        (E, 7, E): [(FIN, FEX, E)]
    }
]

nfa_E2 = [
    "E: a E", (E, 0, E), {
        (E, 0, E): [(a, 1, E)],
        (a, 1, E): [(E, 2, E)],
        (E, 2, E): [(FIN, FEX, E)]
    }
]

langlet.parse_nfa.nfas[E] = nfa_E2

tt = TokenTracer(langlet)
tt.selectables()
tt.select(1)
tt.select(1)
tt.select(1)
tt.select(1)
tt.select(1)
Example #16
langlet.parse_nfa.start_symbols = (E,)


nfa_E = ["E: a '+' E | a '*' E | 'a'",
    (E,0,E),
    {(E,0,E):[(a,1,E),(a,2,E),(a,3,E)],
     (a,1,E):[(PLUS,4,E)],
     (a,2,E):[(MUL,5,E)],
     (PLUS,4,E):[(E,6,E)],
     (MUL,5,E):[(E,7,E)],
     (a,3,E):[(FIN, FEX, E)],
     (E,6,E):[(FIN, FEX,E)],
     (E,7,E):[(FIN, FEX,E)]}]

nfa_E2 = ["E: a E",
    (E,0,E),
    {(E,0,E):[(a,1,E)],
     (a,1,E):[(E,2,E)],
     (E,2,E):[(FIN, FEX,E)]}]

langlet.parse_nfa.nfas[E] = nfa_E2

tt = TokenTracer(langlet)
tt.selectables()
tt.select(1)
tt.select(1)
tt.select(1)
tt.select(1)
tt.select(1)

Example #17
def random_rule(stoplen):
    # some rules to constrain 'interesting' cases
    #
    # 1. Avoid double parens (( ... )) or double square braces [[ ... ]]
    # 2. Avoid use of STRING
    # 3. Avoid sequences of NAME longer than 2 i.e. NAME NAME NAME
    trace = []
    ttracer = TokenTracer(ls_grammar, start = ls_grammar.symbol.rhs)
    STRING = ls_grammar.token.STRING
    NAME   = ls_grammar.token.NAME
    LPAR   = ls_grammar.token.LPAR
    RPAR   = ls_grammar.token.RPAR
    LSQB   = ls_grammar.token.LSQB
    RSQB   = ls_grammar.token.RSQB

    selection = list(ttracer.selectables())

    while True:
        # print len(trace), selection
        if len(trace)>stoplen:
            if None in selection:
                return trace
            elif RSQB in selection:
                trace.append(RSQB)
                selection = ttracer.select(RSQB)
                continue
            elif RPAR in selection:
                trace.append(RPAR)
                selection = ttracer.select(RPAR)
                continue
        while selection:
            k = random.randrange(len(selection))
            item = selection[k]
            selection.remove(item)
            if item is None:
                continue
            if item == STRING:
                continue
            elif item in (NAME, LPAR, LSQB):
                if len(trace)>=2:
                    if trace[-1] == trace[-2] == item:
                        continue
                    if item in (LPAR, LSQB):
                        if trace[-2] in (LPAR, LSQB) and trace[-1] in (LPAR, LSQB):
                            continue

            elif item in (RSQB, RPAR):
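                # the previous token is the same closing bracket: walk back to the
                # matching opener and skip this choice if it would complete a doubled
                # pair like (( ... )) or [[ ... ]] (see rule 1 above)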
                if trace and trace[-1] == item:
                    if item == RSQB:
                        LEFT = LSQB
                    else:
                        LEFT = LPAR
                    RIGHT = item
                    m = len(trace)-2
                    double = False
                    level = -2
                    while m:
                        if trace[m] == RIGHT:
                            level-=1
                        elif trace[m] == LEFT:
                            level+=1
                        if level == 0:
                            if trace[m+1] == LEFT:
                                double = True
                            break
                        m-=1
                    if double:
                        continue
            trace.append(item)
            selection = list(ttracer.select(item))
            break
Example #18
                    S.append(c)
                else:
                    c = chars[random.randrange(0, len(chars))]
                    S.append(c)
                selection = list(tracer.select(t))
                break

            if len(S) >= self.stdlen:
                if FIN in selection:
                    if S[0] in ('"', "'"):
                        if len(S) >= 4:
                            return ''.join(S)
                    if S[0] in string.digits:
                        if len(S) >= 4:
                            return ''.join(S)
                    if random.randrange(0, 2) == 0:
                        return ''.join(S)


if __name__ == '__main__':
    import langscape
    from langscape.trail.tokentracer import TokenTracer
    python = langscape.load_langlet("python")
    tracer = TokenTracer(python, python.lex_symbol.Single3, "lex")

    tokgen = TokenGenerator(python)
    for i in range(100):
        s = tokgen.gen_token_string(python.lex_symbol.NAME)
        print s
        #print "NUM", "%-8s %s"%(s, eval(s))