def string2tokens(string):
    '''Turn a raw string into a flat list of lower-cased sub-tokens.

    Every non-alphabetic character is replaced by a space, the text is
    normalised with ``SPACE_SPLITTER`` and split on whitespace, and each
    resulting word is handed to ``split_identifier``. The pieces returned for
    all words are concatenated in order and lower-cased.
    '''
    # Keep letters only; anything else becomes a word separator.
    letters_only = ''.join(
        char if char.isalpha() else ' ' for char in string
    )
    normalised = SPACE_SPLITTER.sub(" ", letters_only)

    # Split every word eagerly, then flatten the per-word results.
    pieces_per_word = [split_identifier(word) for word in normalised.split()]
    flat_pieces = itertools.chain.from_iterable(pieces_per_word)
    return list(map(str.lower, flat_pieces))
def parse_docstring_tokens(self, docstring_tokens):
    '''Build comment tokens from an already-tokenised docstring.

    Characters listed in ``MEANINGLESS_TOKENS`` are stripped from every token,
    each cleaned token is split with ``split_identifier(..., str_flag=False)``,
    and the flattened result is passed through ``util.stress_tokens``. When
    ``self.to_lower`` is truthy the result is additionally passed through
    ``util.lower``.
    '''
    cleaned = [
        ''.join(char for char in token if char not in MEANINGLESS_TOKENS)
        for token in docstring_tokens
    ]
    # Split eagerly; only the flattening step is lazy (an itertools.chain).
    pieces_per_token = [
        split_identifier(token, str_flag=False) for token in cleaned
    ]
    flattened = itertools.chain.from_iterable(pieces_per_token)

    stressed = util.stress_tokens(flattened)
    if not self.to_lower:
        return stressed
    return util.lower(stressed)
def parse_docstring(self, docstring):
    '''Build comment tokens from a raw docstring.

    Steps, in order:
      1. remove every ``{@`` together with the non-whitespace run after it;
      2. remove each span running from a ``{`` to the last ``}`` on the same
         line (the ``.+`` match is greedy and does not cross newlines);
      3. strip all characters listed in ``MEANINGLESS_TOKENS``;
      4. split on single spaces and split every piece with
         ``split_identifier(..., str_flag=False)``;
      5. pass the flattened list through ``util.stress_tokens`` and, when
         ``self.to_lower`` is truthy, through ``util.lower``.
    '''
    text = re.sub(r'\{\@\S+', '', docstring)
    text = re.sub(r'{.+}', '', text)
    text = ''.join(char for char in text if char not in MEANINGLESS_TOKENS)

    # split(' ') is deliberate: consecutive spaces yield empty strings that
    # are still handed to split_identifier, exactly as before.
    tokens = []
    for word in text.split(' '):
        tokens.extend(split_identifier(word, str_flag=False))

    stressed = util.stress_tokens(tokens)
    if not self.to_lower:
        return stressed
    return util.lower(stressed)
def pad_leaf_node(ast_tree: Dict, max_len: int, PAD_TOKEN=PAD) -> Dict:
    '''
    Attach a fixed-length sub-token list to every leaf node of ``ast_tree``.

    A leaf is a node whose ``'children'`` list holds exactly one ``str``. That
    string is split with ``split_identifier``; if the split yields nothing, the
    string itself is used as the only sub-token. The sub-token list is then
    truncated to ``max_len`` entries or right-padded with ``PAD_TOKEN`` up to
    ``max_len``, and appended to the node's children, e.g.
        VariableName -> [VariableName, [Variable, Name, PAD_TOKEN, PAD_TOKEN, ...]]

    The tree is modified in place and also returned.
    '''
    for node in ast_tree.values():
        children = node['children']
        # Only nodes with a single string child are leaves; skip the rest.
        if len(children) != 1 or not isinstance(children[0], str):
            continue

        leaf = children[0]
        pieces = split_identifier(leaf, False)
        if not pieces:
            pieces = [leaf]

        shortfall = max_len - len(pieces)
        if shortfall > 0:
            pieces.extend([PAD_TOKEN] * shortfall)
        else:
            pieces = pieces[:max_len]

        children.append(pieces)
    return ast_tree