def mix_sources(source_A, source_B, from_line, to_line=-1):
    """Put a little bit of B into A."""
    if to_line == -1:
        to_line = from_line
    file_A_lines = [line + '\n' for line in source_A.split('\n')]
    file_B_lines = [line + '\n' for line in source_B.split('\n')]
    tokens_A = tokenizer.tokenize(source_A)
    tokens_B = tokenizer.tokenize(source_B)
    tokens = zip(tokens_A, tokens_B)

    output_source = ""
    first_part = ''.join(file_A_lines[:(from_line - 1)])
    output_source += first_part

    from_token = None
    first_token_of_A = None
    to_token = None
    last_token_of_A = None
    for token_A, token_B in tokens:
        if from_line <= token_A.position[0] <= to_line:
            if from_token is None:
                from_token = token_B
                first_token_of_A = token_A
            to_token = token_B
            last_token_of_A = token_A
    # print(first_token_of_A, last_token_of_A)

    if last_token_of_A:
        if first_token_of_A.position[0] != from_line:
            output_source += ''.join(
                file_A_lines[(from_line - 1):(first_token_of_A.position[0] - 1)])
        output_source += " " * (first_token_of_A.position[1] - 1)
        output_source += source_B[(
            len(''.join(file_B_lines[:(from_token.position[0] - 1)])) +
            from_token.position[1] - 1):(
                len(''.join(file_B_lines[:(to_token.position[0] - 1)])) +
                to_token.position[1] + len(to_token.value) - 1)]
        output_source += '\n'
        if last_token_of_A.position[0] != to_line:
            output_source += ''.join(
                file_A_lines[(last_token_of_A.position[0]):(to_line)])
        output_source += ''.join(file_A_lines[(to_line):])
    else:
        output_source += ''.join(file_A_lines[(from_line - 1):])
    return output_source
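# Hedged usage sketch for mix_sources: it assumes `tokenizer` is
# `javalang.tokenizer` (as imported elsewhere in these snippets) and that both
# inputs are complete Java sources. The paths and the line range below are
# hypothetical, purely for illustration.
def _demo_mix_sources(path_a='A.java', path_b='B.java'):
    with open(path_a) as f_a, open(path_b) as f_b:
        source_A = f_a.read()
        source_B = f_b.read()
    # Splice the tokens occupying lines 10-12 of B into A at the same lines.
    return mix_sources(source_A, source_B, from_line=10, to_line=12)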
def tokenize_java_code(code):
    byte_str = io.BytesIO(code).read()     # `code` is expected to be a bytes object
    string_obj = byte_str.decode('utf-8')  # decode to a unicode string
    tokens = list(tokenizer.tokenize(string_obj))
    tokens = [token for t in tokens for token in t.value.split(" ")]
    return tokens
def count_max_identifier_occurences(snippet):
    '''
    Calculates the maximum occurrences of any identifier in any line in snippet.
    '''
    top_freq_perline = []  # list with highest identifier frequency in each line
    for line in snippet:
        try:
            line_tokens = list(tokenizer.tokenize(line))
            line_identifiers = [
                token.value for token in line_tokens
                if type(token) == JAVA_IDENTIFIER
            ]
            identifier_freq = Counter(line_identifiers)
            if identifier_freq:  # avoid lines without any identifiers
                top_identifier_freq = identifier_freq.most_common(1)[0][1]
            else:
                top_identifier_freq = 0
            top_freq_perline.append(top_identifier_freq)
        except Exception as err:
            top_freq_perline = []
            break
    if top_freq_perline:
        max_identifier_freq = max(top_freq_perline)
    else:
        max_identifier_freq = 0
    return {'max_identifier_occurences': max_identifier_freq}
def count_identifiers(snippet):
    '''
    Calculates the average number of identifiers per line and the maximum in
    any single line of snippet.
    '''
    perline_identifiers = []
    for line in snippet:
        try:
            line_tokens = list(tokenizer.tokenize(line))
            perline_identifiers.append([
                token.value for token in line_tokens
                if type(token) == JAVA_IDENTIFIER
            ])
        except Exception as err:
            perline_identifiers = []
            break
    # maximum number of identifiers in any line
    if perline_identifiers:
        max_identifiers_perline = len(max(perline_identifiers, key=len))
        total_identifiers = sum([len(l) for l in perline_identifiers])
        avg_identifiers_perline = total_identifiers / len(perline_identifiers)
    else:
        max_identifiers_perline = 0
        avg_identifiers_perline = 0
    return {
        'max_identifiers_perline': max_identifiers_perline,
        'avg_identifiers_perline': avg_identifiers_perline,
    }
def tokenize_and_abstract(self, source_code):
    """As per the superclass."""
    try:
        java_tokens = tokenizer.tokenize(source_code)
    except tokenizer.LexerError as e:
        logging.warn('The tokenizer raised exception `%s` while parsing %s', e,
                     source_code)
        return (
            (cubert_tokenizer.quote_special(
                unified_tokenizer.TokenKind.ERROR.name),
             unified_tokenizer.TokenKind.ERROR),
            (cubert_tokenizer.quote_special(unified_tokenizer.TokenKind.EOS),
             unified_tokenizer.TokenKind.EOS),
        )
    agnostic_tokens: List[Tuple[str, unified_tokenizer.TokenKind]] = []
    for token in java_tokens:
        # The token kind is the subclass type of the token.
        token_type = type(token)
        if token_type not in JavaTokenizer._TOKEN_TYPE_MAP:
            raise ValueError('Received Java token type %s, but it was unexpected, '
                             'while tokenizing \n%s\n' % (token_type, source_code))
        agnostic_tokens.append(
            (token.value, JavaTokenizer._TOKEN_TYPE_MAP[token_type]))
    return agnostic_tokens
def count_identifier_length(snippet):
    '''
    Calculates the average and maximum identifier length in snippet.
    '''
    perline_identifiers = []
    total_lines = 0
    for line in snippet:
        total_lines += 1
        try:
            line_tokens = list(tokenizer.tokenize(line))
            perline_identifiers.append([
                token.value for token in line_tokens
                if type(token) == JAVA_IDENTIFIER
            ])
        except Exception as err:
            perline_identifiers = []
            break
    # concatenate sublists of identifiers into one list.
    total_identifiers = list(chain.from_iterable(perline_identifiers))
    if total_identifiers:
        max_identifier_length = len(max(total_identifiers, key=len))
        avg_identifier_length = sum(map(
            len, total_identifiers)) / len(total_identifiers)
    else:
        max_identifier_length = 0
        avg_identifier_length = 0
    return {
        'max_identifier_length': max_identifier_length,
        'avg_identifier_length': avg_identifier_length
    }
def count_numbers(snippet):
    '''
    Calculates the average number of number tokens per line and the maximum in
    any single line of snippet.
    '''
    perline_numbers = []
    for line in snippet:
        try:
            line_tokens = list(tokenizer.tokenize(line))
            perline_numbers.append([
                token.value for token in line_tokens
                if type(token) in JAVA_NUMBER
            ])
        except Exception as err:
            perline_numbers = []
            break
    if perline_numbers:
        # maximum number of number tokens in any line
        max_numbers_perline = len(max(perline_numbers, key=len))
        # total number tokens in snippet
        total_numbers = sum([len(l) for l in perline_numbers])
        # average numbers per line in snippet
        avg_numbers_perline = total_numbers / len(perline_numbers)
    else:
        max_numbers_perline = 0
        avg_numbers_perline = 0
    return {
        'max_numbers_perline': max_numbers_perline,
        'avg_numbers_perline': avg_numbers_perline
    }
def count_comparison_operators(snippet):
    '''
    Calculates the average number of comparison operators per line in snippet.
    '''
    total_lines = 0
    total_operators = 0
    for line in snippet:
        total_lines += 1
        try:
            line_tokens = list(tokenizer.tokenize(line))
            total_operators += sum([
                1 for token in line_tokens
                if token.value in JAVA_COMPARISON_OPERATORS
            ])
        except Exception as err:
            total_operators = 0
            break
    if total_lines > 0:
        avg_comparison_operators_perline = total_operators / total_lines
    else:
        avg_comparison_operators_perline = 0
    return {
        'avg_comparison_operators_perline': avg_comparison_operators_perline
    }
def count_keywords(snippet):
    '''
    Calculates the average number of keywords per line and the maximum in any
    single line of snippet.
    '''
    perline_keywords = []
    for line in snippet:
        try:
            line_tokens = list(tokenizer.tokenize(line))
            perline_keywords.append([
                token.value for token in line_tokens
                if type(token) == JAVA_KEYWORD
            ])
        except Exception as err:
            perline_keywords = []
            break
    if perline_keywords:
        # maximum number of keywords in any line
        max_keywords_perline = len(max(perline_keywords, key=len))
        # total keywords in snippet
        total_keywords = sum([len(l) for l in perline_keywords])
        # average keywords per line in snippet
        avg_keywords_perline = total_keywords / len(perline_keywords)
    else:
        max_keywords_perline = 0
        avg_keywords_perline = 0
    return {
        'max_keywords_perline': max_keywords_perline,
        'avg_keywords_perline': avg_keywords_perline
    }
def parse_member_signature(sig):
    if not sig.endswith(';'):
        sig = sig + ';'
    tokens = tokenize(sig)
    parser = Parser(tokens)
    return parser.parse_member_declaration()
def parse_expression(exp):
    if not exp.endswith(';'):
        exp = exp + ';'
    tokens = tokenize(exp)
    parser = Parser(tokens)
    return parser.parse_expression()
def parse_type_signature(sig):
    if sig.endswith(';'):
        sig = sig[:-1]
    sig = sig + '{ }'
    tokens = tokenize(sig)
    parser = Parser(tokens)
    return parser.parse_class_or_interface_declaration()
def tokenize(code):
    """Tokenizes a given source code.

    Args:
        code: the source code string to tokenize.

    Returns:
        list of string tokens
    """
    if code not in TOKENIZER_CACHE:
        # Materialize the map into a list so the cached value can be reused;
        # caching the lazy map object would yield an empty result after the
        # first read.
        TOKENIZER_CACHE[code] = list(
            map(lambda t: t.value, tokenizer.tokenize(code)))
    return TOKENIZER_CACHE[code]
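# Minimal usage sketch for the cached tokenize above; it assumes
# TOKENIZER_CACHE is a module-level dict and `tokenizer` is
# `javalang.tokenizer`. The snippet string is hypothetical.
def _demo_cached_tokenize():
    code = 'int x = foo(1, 2);'
    first = tokenize(code)
    second = tokenize(code)  # served from TOKENIZER_CACHE on the second call
    return first, second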
def tokenize_code(self, code_snippet, identifier, verbose=0):
    code = self.parse_code(code_snippet, identifier)
    if code == ERROR_MESSAGE or code == EMPTY_MESSAGE:
        return []
    try:
        return [t.value for t in tokenizer.tokenize(code)]
    except Exception as e:
        if verbose == 1:
            print('\n'.join([code, str(e)]))
        return []
def parse_constructor_signature(sig):
    # Add an empty body to the signature, replacing a ; if necessary
    if sig.endswith(';'):
        sig = sig[:-1]
    sig = sig + '{ }'
    tokens = tokenize(sig)
    parser = Parser(tokens)
    return parser.parse_member_declaration()
def tokenize_and_abstract(self, source_code):
    """As per the superclass."""
    agnostic_tokens: List[unified_tokenizer.AbstractToken] = []

    try:
        java_tokens = tokenizer.tokenize(source_code)
        for token in java_tokens:
            # The token kind is the subclass type of the token.
            token_type = type(token)
            if token_type not in JavaTokenizer._TOKEN_TYPE_MAP:
                raise ValueError(
                    'Received Java token type %s, but it was unexpected, '
                    'while tokenizing \n%s\n' % (token_type, source_code))
            # The tokenizer seems to take some liberties with Unicode, returning
            # invalid characters. This cleans spellings up.
            spelling = token.value.encode('utf-8', errors='replace').decode('utf-8')
            agnostic_tokens.append(
                unified_tokenizer.AbstractToken(
                    spelling, JavaTokenizer._TOKEN_TYPE_MAP[token_type],
                    unified_tokenizer.TokenMetadata(
                        start=unified_tokenizer.Position(
                            # JavaTokenizer counts lines and columns from 1.
                            line=token.position.line - 1,
                            column=token.position.column - 1))))
    except (tokenizer.LexerError, TypeError) as e:
        # Sometimes, javalang returns a TypeError when reading a number.
        # See
        # https://github.com/c2nes/javalang/blob/0664afb7f4d40254312693f2e833c1ed4ac551c7/javalang/tokenizer.py#L370
        logging.warn('The tokenizer raised exception `%r` while parsing %s', e,
                     source_code)
        agnostic_tokens.append(
            unified_tokenizer.AbstractToken(
                cubert_tokenizer.quote_special(
                    unified_tokenizer.TokenKind.ERROR.name),
                unified_tokenizer.TokenKind.ERROR,
                unified_tokenizer.TokenMetadata()))

    # javalang doesn't seem to ever return `EndOfInput` despite there being a
    # token type for it. We insert it here.
    agnostic_tokens.append(
        unified_tokenizer.AbstractToken(
            cubert_tokenizer.quote_special(unified_tokenizer.TokenKind.EOS.name),
            unified_tokenizer.TokenKind.EOS,
            unified_tokenizer.TokenMetadata()))

    return agnostic_tokens
def tokenize_with_white_space(file_content, relative=True,
                              new_line_at_the_end_of_file=True):
    """
    Tokenize the java source code
    :param file_content: the java source code
    :return: (whitespace, tokens)
    """
    position_last_line = 1
    tokens = tokenizer.tokenize(file_content, parse_comments=True)
    tokens = [t for t in tokens]
    whitespace = list()
    for index in range(0, len(tokens) - 1):
        tokens_position = tokens[index].position
        next_token_position = tokens[index + 1].position
        end_of_token = (tokens_position[0],
                        tokens_position[1] + len(tokens[index].value))
        if end_of_token == next_token_position:
            whitespace.append((0, 0))
        else:
            if (end_of_token[0] == next_token_position[0]):
                # same line
                whitespace.append(
                    (0, next_token_position[1] - end_of_token[1]))
            else:
                # new line
                if relative:
                    whitespace.append(
                        (next_token_position[0] - end_of_token[0] -
                         tokens[index].value.count('\n'),
                         next_token_position[1] - position_last_line))
                    position_last_line = next_token_position[1]
                else:
                    whitespace.append(
                        (next_token_position[0] - end_of_token[0] -
                         tokens[index].value.count('\n'),
                         next_token_position[1]))
    if new_line_at_the_end_of_file:
        whitespace.append((1, 0))
    else:
        if file_content[-1] == '\n':
            if file_content[-2] == '\n':
                whitespace.append((2, 0))
            else:
                whitespace.append((1, 0))
        else:
            whitespace.append((0, 0))
    # rewritten = reformat(whitespace, tokens)
    # print(rewritten)
    # return rewritten
    return whitespace, tokens
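# Hedged usage sketch for tokenize_with_white_space above: the Java source is
# hypothetical and `tokenizer` is assumed to be `javalang.tokenizer`. Each
# whitespace entry describes the gap after the corresponding token, roughly
# (newlines, spaces/indentation).
def _demo_tokenize_with_white_space():
    java_source = 'class A {\n    int x = 1;\n}\n'
    whitespace, tokens = tokenize_with_white_space(java_source)
    for ws, tok in zip(whitespace, tokens):
        print(tok.value, ws)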
def get_tokens(path):
    toks = []
    try:
        with open(path, 'r') as f:
            for tok in tokenizer.tokenize(f.read()):
                toks.append(tok.value)
    except Exception as e:
        pass
    seq = []
    while True:
        try:
            tok = toks.pop(0)
            if tok == '.':
                try:
                    nxt = toks.pop(0)
                    r = '.' + nxt
                    if seq:
                        if seq[-1] not in (',', '('):
                            seq[-1] += r
                        else:
                            seq.append(r)
                    else:
                        seq.append(r)
                except IndexError:
                    seq.append(tok)
            elif tok == ',':
                try:
                    nxt = toks.pop(0)
                    if nxt in ('}', ';'):
                        seq.append(nxt)
                    else:
                        seq.append(tok)
                        seq.append(nxt)
                except IndexError:
                    seq.append(tok)
            else:
                seq.append(tok)
        except IndexError:
            break
    return seq
def process_source_code(tup):
    """Get number of tokens and number of lines"""
    code, fname = tup
    try:
        tokens = list(tokenize(code))
    except LexerError as e:
        return Data(bad_syntax=1)
    except Exception as e:
        return Data(library_errors=1)
    if tokens:
        return Data(tokens=len(tokens), lines=tokens[-1].position.line)
    else:
        return Data(empty=1)
def count_commas(snippet):
    '''
    Calculates the average number of commas (,) per line in snippet.
    '''
    total_lines = 0
    total_commas = 0
    for line in snippet:
        total_lines += 1
        try:
            line_tokens = list(tokenizer.tokenize(line))
            total_commas += sum(
                [1 for token in line_tokens if token.value == ","])
        except Exception as err:
            total_commas = 0
            break
    if total_lines > 0:
        avg_commas_perline = total_commas / total_lines
    else:
        avg_commas_perline = 0
    return {'avg_commas_perline': avg_commas_perline}
def count_assignments(snippet):
    '''
    Calculates the average number of assignments (=) per line in snippet.
    '''
    total_lines = 0
    total_assignments = 0
    for line in snippet:
        total_lines += 1
        try:
            line_tokens = list(tokenizer.tokenize(line))
            total_assignments += sum(
                [1 for token in line_tokens if token.value == "="])
        except Exception as err:
            total_assignments = 0
            break
    if total_lines > 0:
        avg_assignments_perline = total_assignments / total_lines
    else:
        avg_assignments_perline = 0
    return {'avg_assignments_perline': avg_assignments_perline}
def count_parenthesis(snippet):
    '''
    Calculates the average number of parentheses ((, )) per line in snippet.
    '''
    total_parenthesis = 0
    total_lines = 0
    for line in snippet:
        total_lines += 1
        try:
            line_tokens = list(tokenizer.tokenize(line))
            total_parenthesis += sum([
                1 for token in line_tokens
                if token.value == "(" or token.value == ")"
            ])
        except Exception as err:
            total_parenthesis = 0
            break
    if total_lines > 0:
        avg_parenthesis_perline = total_parenthesis / total_lines
    else:
        avg_parenthesis_perline = 0
    return {'avg_parenthesis_perline': avg_parenthesis_perline}
def count_branches(snippet):
    '''
    Calculates the average number of branches (if/switch) per line in snippet.
    '''
    total_branches = 0
    total_lines = 0
    for line in snippet:
        total_lines += 1
        try:
            line_tokens = list(tokenizer.tokenize(line))
            total_branches += sum([
                1 for token in line_tokens
                if token.value == "if" or token.value == "switch"
            ])
        except Exception as err:
            total_branches = 0
            break
    if total_lines > 0:
        avg_branches_perline = total_branches / total_lines
    else:
        avg_branches_perline = 0
    return {'avg_branches_perline': avg_branches_perline}
def count_loops(snippet):
    '''
    Calculates the average number of loops (for/while) per line in snippet.
    '''
    total_loops = 0
    total_lines = 0
    for line in snippet:
        total_lines += 1
        try:
            line_tokens = list(tokenizer.tokenize(line))
            total_loops += sum([
                1 for token in line_tokens
                if token.value == "for" or token.value == "while"
            ])
        except Exception as err:
            total_loops = 0
            break
    if total_lines > 0:
        avg_loops_perline = total_loops / total_lines
    else:
        avg_loops_perline = 0
    return {'avg_loops_perline': avg_loops_perline}
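# Hedged sketch showing how the count_* feature extractors above might be
# combined into one feature dict. Each extractor expects `snippet` to be a
# list of source lines and returns a dict; the sample Java lines here are
# hypothetical, and the module-level `tokenizer` and token-type constants are
# assumed to be in place.
def _demo_snippet_features():
    snippet = [
        'int total = 0;',
        'for (int i = 0; i < n; i++) {',
        '    total += values[i];',
        '}',
    ]
    features = {}
    for extractor in (count_identifiers, count_numbers, count_keywords,
                      count_assignments, count_parenthesis, count_branches,
                      count_loops):
        features.update(extractor(snippet))
    return features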
def tokenize_with_white_space(file_content, relative=True):
    """
    Tokenize the java source code
    :param file_content: the java source code
    :return: (whitespace, tokens)
    """
    indentation_last_line = 1
    file_content_lines = file_content.split('\n')
    javalang_tokens = javalang_tokenizer.tokenize(file_content,
                                                  parse_comments=True)
    tokens = []
    count = 0
    try:
        for t in javalang_tokens:
            count += 1
            if count > 1000000:
                break
            tokens.append(t)
    except Exception as err:
        print('Something wrong happened while tokenizing the following content: '
              + file_content)
        return None, None
    whitespace = list()
    for index in range(0, len(tokens) - 1):
        tokens_position = tokens[index].position
        next_token_position = tokens[index + 1].position
        end_of_token = (tokens_position[0],
                        tokens_position[1] + len(tokens[index].value))
        if end_of_token == next_token_position:
            whitespace.append((0, 0, 'None'))
        else:
            if end_of_token[0] == next_token_position[0]:
                # same line
                if file_content_lines[tokens_position[0] - 1] != '':
                    if (len(file_content_lines[tokens_position[0] - 1]) > end_of_token[1]
                            and file_content_lines[tokens_position[0] - 1][end_of_token[1]] == '\t'):
                        space_type = 'TB'
                    else:
                        space_type = 'SP'
                else:
                    space_type = 'None'
                whitespace.append(
                    (0, next_token_position[1] - end_of_token[1], space_type))
            else:
                # new line
                new_line = file_content_lines[next_token_position[0] - 1]
                if new_line != '':
                    if new_line[get_line_indent(new_line) - 1] == '\t':
                        space_type = 'TB'
                    else:
                        space_type = 'SP'
                else:
                    space_type = 'None'
                if relative:
                    spaces = next_token_position[1] - indentation_last_line
                    whitespace.append(
                        (next_token_position[0] - end_of_token[0] -
                         tokens[index].value.count('\n'), spaces, space_type))
                    indentation_last_line = next_token_position[1]
                else:
                    whitespace.append(
                        (next_token_position[0] - end_of_token[0] -
                         tokens[index].value.count('\n'),
                         next_token_position[1] - 1, space_type))
    count_line_break = 0
    for index in range(len(file_content) - 1, 0, -1):
        if file_content[index] == '\n':
            count_line_break += 1
        elif file_content[index] != ' ' and file_content[index] != '\t':
            break
    whitespace.append((count_line_break, 0, 'None'))
    return whitespace, tokens
def parse(s):
    tokens = tokenize(s)
    parser = Parser(tokens)
    return parser.parse()
def gen_ugly(file_path, output_dir, modification_number=(1, 0, 0, 0, 0)):
    """
    Gen an ugly version of a .java file
    """
    insertions_sample_size_space = modification_number[0]
    insertions_sample_size_tab = modification_number[1]
    insertions_sample_size_newline = modification_number[2]
    insertions_sample_size = (insertions_sample_size_space +
                              insertions_sample_size_tab +
                              insertions_sample_size_newline)
    deletions_sample_size_space = modification_number[3]
    deletions_sample_size_newline = modification_number[4]
    deletions_sample_size = (deletions_sample_size_space +
                             deletions_sample_size_newline)
    # deletions_sample_size = modification_number - insertions_sample_size

    with open(file_path) as f:
        file_lines = f.readlines()
    file_content = "".join(file_lines)

    tokens = tokenizer.tokenize(file_content)
    tokens = [t for t in tokens]
    # print("\n".join([str(t) for t in tokens]))

    # Take a sample of locations suitable for insertions
    insertions_sample = random.sample(tokens,
                                      min(insertions_sample_size, len(tokens)))
    insertions = dict()
    insertions_chars = ([' '] * insertions_sample_size_space)
    insertions_chars.extend(['\t'] * insertions_sample_size_tab)
    insertions_chars.extend(['\n'] * insertions_sample_size_newline)
    random.shuffle(insertions_chars)
    for element, char in zip(insertions_sample, insertions_chars):
        insertions[element.position] = char

    # Select every location suitable for deletions (i.e. before or after a
    # separator/operator)
    deletions_spots = list()
    suitable_for_deletions = [tokenizer.Separator, tokenizer.Operator]
    for index in range(0, len(tokens) - 1):
        if (type(tokens[index]) in suitable_for_deletions):
            prev_token_position = tokens[index - 1].position
            tokens_position = tokens[index].position
            next_token_position = tokens[index + 1].position
            end_of_prev_token = (prev_token_position[0],
                                 prev_token_position[1] +
                                 len(tokens[index - 1].value))
            end_of_token = (tokens_position[0],
                            tokens_position[1] + len(tokens[index].value))
            if (end_of_prev_token != tokens_position):
                # print("prev : ", tokens[index-1].value, tokens[index].value,
                #       tokens[index+1].value, tokens[index].position)
                deletions_spots.append((end_of_prev_token, tokens_position))
            if (end_of_token != next_token_position):
                # print("next : ", tokens[index-1].value, tokens[index].value,
                #       tokens[index+1].value, tokens[index].position)
                deletions_spots.append((end_of_token, next_token_position))
    deletions_spots = list(set(deletions_spots))

    # Take a sample of locations suitable for deletions
    deletions_sample = random.sample(
        deletions_spots, min(deletions_sample_size, len(deletions_spots)))

    deletions = dict()
    for deletion_intervals in deletions_spots:
        # print(deletion_intervals)
        from_char = deletion_intervals[0]
        to_char = deletion_intervals[1]
        while from_char[0] <= to_char[0]:
            if from_char[0] == to_char[0]:
                interval = I.closedopen(from_char[1], to_char[1])
            else:
                interval = I.closedopen(from_char[1], I.inf)
            if (from_char[0] not in deletions):
                deletions[from_char[0]] = list()
            deletions[from_char[0]].append(interval)
            from_char = (from_char[0] + 1, 0)

    deletions_spots_chars = dict()
    line_num = 1
    for line in file_lines:
        char_num = 1
        for char in line:
            if (line_num in deletions):
                for intervals in deletions[line_num]:
                    if char_num in intervals:
                        if (char not in deletions_spots_chars):
                            deletions_spots_chars[char] = []
                        deletions_spots_chars[char].append((line_num, char_num))
            char_num = char_num + 1
        line_num = line_num + 1

    deletions = []
    if (' ' in deletions_spots_chars):
        deletions.extend(
            random.sample(deletions_spots_chars[' '],
                          deletions_sample_size_space))
    if ('\n' in deletions_spots_chars):
        deletions.extend(
            random.sample(deletions_spots_chars['\n'],
                          deletions_sample_size_newline))
    # print(insertions)
    # print(deletions)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    output_path = os.path.join(output_dir, f'./{file_path.split("/")[-1]}')

    # Write the output file
    with open(output_path, "w") as output_file_object:
        line_num = 1
        for line in file_lines:
            char_num = 1
            for char in line:
                skip = False
                if ((line_num, char_num) in deletions):
                    skip = True
                if ((line_num, char_num) in insertions):
                    output_file_object.write(insertions[(line_num, char_num)])
                if (not skip):
                    output_file_object.write(char)
                char_num = char_num + 1
            line_num = line_num + 1

    return tuple(set(deletions) | set(insertions.keys()))
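# Hedged usage sketch for gen_ugly; the input path and output directory are
# hypothetical. Reading off the unpacking at the top of the function, the
# modification_number tuple is (space insertions, tab insertions, newline
# insertions, space deletions, newline deletions). The module-level imports
# gen_ugly relies on (random, os, and `I` for intervals) are assumed present.
def _demo_gen_ugly():
    changed_positions = gen_ugly('src/Main.java', 'out/ugly',
                                 modification_number=(2, 1, 1, 1, 0))
    return changed_positions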
def _parser(snippet):
    return Parser(tokenize(snippet))
def tokenize(s, out, skip_license=False):
    prev_line = 1
    prev_column = 1
    had_package = False
    for tok in tokenizer.tokenize(s):
        num_newlines = tok.position[0] - prev_line
        if num_newlines > 0:
            out.write(NEWLINE_SYMBOL * num_newlines)
            out.write('\n')
            prev_column = 1
        value = tok.value.strip()
        prev_line = tok.position[0] + value.count('\n')
        num_spaces = tok.position[1] - prev_column
        if num_spaces > 0:
            out.write(SPACE_SYMBOL * num_spaces)
            out.write(' ')
        prev_column = tok.position[1] + len(tok.value)
        if isinstance(tok, tokenizer.Keyword) and tok.value == 'package':
            had_package = True
        if skip_license and isinstance(tok, tokenizer.Comment) and not had_package:
            prev_line += 1
            continue
        # Split quotes from values
        if isinstance(tok, tokenizer.String):
            out.write('" ')
            value = value[1:-1]
        if isinstance(tok, tokenizer.Character):
            out.write("' ")
            value = value[1:-1]
        if isinstance(tok, tokenizer.String) or isinstance(tok, tokenizer.Comment):
            # Join consecutive space symbols or newline symbols into a single word,
            # so that whitespace is encoded the same between tokens and within
            # tokens.
            value = replace_consecutive(value, x=' ', y=SPACE_SYMBOL, y_after=' ')
            value = replace_consecutive(value, x='\n', y=NEWLINE_SYMBOL,
                                        y_after='\n')
        out.write(value.strip(' '))
        out.write(' ')
        if isinstance(tok, tokenizer.String):
            out.write('" ')
        if isinstance(tok, tokenizer.Character):
            out.write("' ")
    out.write(EOF_SYMBOL + '\n')
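# Hedged usage sketch for the stream-writing tokenize above: it assumes
# SPACE_SYMBOL, NEWLINE_SYMBOL and EOF_SYMBOL are defined elsewhere in the
# module, and uses an in-memory buffer instead of a real file. The Java
# snippet is hypothetical.
def _demo_tokenize_to_stream():
    import io
    buf = io.StringIO()
    tokenize('package demo;\n\nclass A { }\n', buf, skip_license=True)
    return buf.getvalue()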
def _tokenize(self, program_string):
    import javalang.tokenizer as tokenizer
    tokens = tokenizer.tokenize(program_string)
    return map(lambda token: (type(token), token.value), tokens), program_string
def tokenize(code):
    return [token.value for token in tokenizer.tokenize(code)]