예제 #1
0
 def string2tokens(string):
     """Turn raw text into a flat list of lower-cased sub-tokens.

     Every non-alphabetic character is treated as a separator. Each remaining
     word is handed to ``split_identifier``, and the pieces it yields are
     flattened into a single list and lower-cased.
     """
     # Blank out anything that is not a letter so it acts as a word boundary.
     letters_only = ''.join(ch if ch.isalpha() else ' ' for ch in string)
     normalized = SPACE_SPLITTER.sub(" ", letters_only)
     pieces_per_word = [split_identifier(word) for word in normalized.split()]
     flattened = itertools.chain.from_iterable(pieces_per_word)
     return [str.lower(piece) for piece in flattened]
예제 #2
0
 def parse_docstring_tokens(self, docstring_tokens):
     """Clean and sub-tokenize an already tokenized docstring.

     Args:
         docstring_tokens: iterable of string tokens taken from a docstring.

     Returns:
         The result of ``util.stress_tokens`` applied to the sub-tokens,
         additionally passed through ``util.lower`` when ``self.to_lower``
         is truthy.
     """
     # Drop every character listed in MEANINGLESS_TOKENS from each token.
     cleaned = [
         ''.join(char for char in token if char not in MEANINGLESS_TOKENS)
         for token in docstring_tokens
     ]
     # Materialize a real list rather than handing a one-shot iterator to the
     # helpers below: an iterator would be silently exhausted if any of them
     # (or the caller) walks the tokens more than once.
     sub_tokens = list(itertools.chain.from_iterable(
         split_identifier(token, str_flag=False) for token in cleaned
     ))
     sub_tokens = util.stress_tokens(sub_tokens)
     if self.to_lower:
         sub_tokens = util.lower(sub_tokens)
     return sub_tokens
예제 #3
0
 def parse_docstring(self, docstring):
     """Extract comment tokens from a raw docstring.

     The text is stripped of ``{@...`` tag openers and of brace-delimited
     spans, then of every character in MEANINGLESS_TOKENS. It is split on
     single spaces, each piece is run through ``split_identifier``, and the
     flattened result is passed to ``util.stress_tokens`` (and to
     ``util.lower`` when ``self.to_lower`` is truthy).
     """
     # Remove "{@" followed by any run of non-whitespace characters.
     without_tags = re.sub(r'\{\@\S+', '', docstring)
     # Remove "{...}" spans; the match is greedy and does not cross newlines.
     without_braces = re.sub(r'{.+}', '', without_tags)
     cleaned = ''.join(
         ch for ch in without_braces if ch not in MEANINGLESS_TOKENS)

     tokens = []
     for word in cleaned.split(' '):
         tokens.extend(split_identifier(word, str_flag=False))

     tokens = util.stress_tokens(tokens)
     if self.to_lower:
         tokens = util.lower(tokens)
     return tokens
예제 #4
0
def pad_leaf_node(ast_tree: Dict, max_len: int, PAD_TOKEN=PAD) -> Dict:
    """Attach a fixed-length sub-token list to every leaf node.

    A leaf is a node whose ``'children'`` holds exactly one string. That
    string is split with ``split_identifier``; the result is truncated to
    ``max_len`` items or right-padded with ``PAD_TOKEN`` up to ``max_len``,
    and appended as a second child, e.g.
    ``VariableName -> [VariableName, [Variable, Name, PAD_TOKEN, ...]]``.

    The nodes of ``ast_tree`` are modified in place and the same mapping is
    returned.
    """
    for node in ast_tree.values():
        children = node['children']
        is_leaf = len(children) == 1 and isinstance(children[0], str)
        if not is_leaf:
            continue

        leaf_text = children[0]
        pieces = split_identifier(leaf_text, False)
        if len(pieces) == 0:
            # Nothing came back from the splitter: keep the raw token itself.
            pieces = [leaf_text]

        missing = max_len - len(pieces)
        if missing <= 0:
            pieces = pieces[:max_len]
        else:
            pieces.extend([PAD_TOKEN] * missing)
        children.append(pieces)
    return ast_tree