Example No. 1
def tokenize_errored_file(file, file_orig, error):
    spaces, tokens = jlu.tokenize_with_white_space(jlu.open_file(file))
    token_started = False
    from_token = -1
    to_token = -1
    count = 0
    tokens_errored = []
    n_lines = 5  # number of context lines kept around the error line
    for token, space in zip(tokens, spaces):
        # Open the error tag at the first token on the reported error line.
        if not token_started and int(error['line']) == token.position[0]:
            token_started = True
            tokens_errored.append(f'<{error["type"]}>')
            from_token = count
        # Close the error tag at the first token past the error line.
        if token_started and int(error['line']) < token.position[0]:
            token_started = False
            tokens_errored.append(f'</{error["type"]}>')
            to_token = count
        # Keep only tokens within n_lines of the error line.
        if int(error['line']) - n_lines <= token.position[0] <= int(error['line']) + n_lines:
            tokens_errored.append(get_token_value(token))
            tokens_errored.append(get_space_value(space))
        count += 1
    if token_started:
        # The error line was the last line of the file: close the tag explicitly.
        tokens_errored.append(f'</{error["type"]}>')
        to_token = count
    if from_token == -1:
        # No token falls on the error line: emit an empty tag pair.
        tokens_errored.append(f'<{error["type"]}>')
        tokens_errored.append(f'</{error["type"]}>')

    # Tokenize the original (fixed) file and extract the same token span.
    spaces, tokens = jlu.tokenize_with_white_space(jlu.open_file(file_orig))
    tokens_correct = []
    for token, space in zip(tokens[from_token:to_token],
                            spaces[from_token:to_token]):
        tokens_correct.append(get_token_value(token))
        tokens_correct.append(get_space_value(space))
    return tokens_errored, tokens_correct
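A minimal usage sketch (not from the source): it assumes the surrounding module provides jlu, get_token_value and get_space_value, that the error dict carries at least the 'line' and 'type' fields read above, and that the file paths and check name are hypothetical.

error = {'line': '42', 'type': 'WhitespaceAround'}   # hypothetical error record
tokens_errored, tokens_correct = tokenize_errored_file(
    'Errored.java', 'Original.java', error)          # hypothetical paths
print(tokens_errored[:10])   # tagged token/whitespace sequence around the error
print(tokens_correct[:10])   # the same span taken from the original file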
Example No. 2
def tokenize_errored_file_model2(file, file_orig, error):
    # Tokenize the errored file and collect the error-window metadata.
    tokens_errored, info = tokenize_file_to_repair(file, error)

    tokens_errored_in_tag = info['tokens_errored_in_tag']
    from_token = info['from_token']
    to_token = info['to_token']

    spaces, tokens = jlu.tokenize_with_white_space(jlu.open_file(file_orig))
    tokens_correct = []

    for token, space in zip(tokens[from_token:to_token],
                            spaces[from_token:to_token]):
        tokens_correct.append(get_token_value(token))
        tokens_correct.append(get_space_value(space))

    if len(tokens_errored_in_tag) != len(tokens_correct):
        print("Warning: errored and correct token sequences differ in length")
    # Count the positions where the errored span differs from the correct one.
    info['count_diff'] = 0
    for t_A, t_B in zip(tokens_errored_in_tag, tokens_correct):
        if t_A != t_B:
            info['count_diff'] += 1

    return tokens_errored, tokens_correct, tokens_errored_in_tag, info
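A similar hedged sketch for the model-2 variant, reusing the hypothetical error record from above; the info keys are the ones filled in by tokenize_file_to_repair (Example No. 5 below).

# Hypothetical usage; info keys come from tokenize_file_to_repair.
tokens_errored, tokens_correct, tokens_in_tag, info = tokenize_errored_file_model2(
    'Errored.java', 'Original.java', error)           # hypothetical paths
print(info['from_token'], info['to_token'], info['count_diff'])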
Example No. 3
def vectorize_file(path, vectorizer):
    spaces, tokens = jlu.tokenize_with_white_space(jlu.open_file(path))

    result = []
    for ws, t in zip(spaces, tokens):
        result.append(vectorizer(ws, t))

    return result
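vectorize_file only needs a callable taking a (whitespace, token) pair; judging by the signatures, that is presumably the get_vector closure returned by build_vocabulary (Example No. 4 below). A minimal sketch with a hypothetical stand-in vectorizer:

def toy_vectorizer(ws, t):
    # Hypothetical stand-in: return the printable pair instead of a one-hot vector.
    return (get_space_value(ws), get_token_value(t))

pairs = vectorize_file('A.java', toy_vectorizer)   # hypothetical path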
Example No. 4
def build_vocabulary(files):
    count = {}
    tokenized_files = [
        jlu.tokenize_with_white_space(jlu.open_file(path)) for path in files
    ]
    whitespace_id = set()

    # Tokens seen fewer than `threshold` times share a single "unknown" slot.
    threshold = 30

    for spaces, tokens in tokenized_files:
        whitespace_id = set(spaces) | whitespace_id
        for token in tokens:
            name = get_token_value(token)
            if name not in count:
                count[name] = 0
            count[name] += 1

    # Keep only frequent literals and assign each one an index.
    litterals = list(filter(lambda key: count[key] >= threshold, count.keys()))
    litterals = {key: index for index, key in enumerate(litterals)}

    # Assign an index to every whitespace shape seen in the corpus.
    whitespace_id = {key: index for index, key in enumerate(whitespace_id)}

    len_litterals = len(litterals)
    len_whitespace = len(whitespace_id)
    # Vector layout: [literal one-hot | unknown-literal slot | whitespace one-hot].
    vec_size = len_litterals + 1 + len_whitespace

    def get_vector(space, token):
        # One-hot encode the (token, whitespace) pair; numpy is assumed to be
        # imported as np at the module level.
        vector = np.array([0] * vec_size)
        if get_token_value(token) in litterals:
            vector[litterals[get_token_value(token)]] = 1
        else:
            vector[len_litterals] = 1
        vector[len_litterals + 1 + whitespace_id[space]] = 1
        return vector

    print(litterals.keys())  # debug output: the retained literal vocabulary

    return get_vector, whitespace_id
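Judging by the signatures, build_vocabulary's get_vector closure is the vectorizer that vectorize_file (Example No. 3) expects, so the two compose directly. A minimal end-to-end sketch with hypothetical file paths:

# Hypothetical paths; get_vector and whitespace_id come from build_vocabulary above.
files = ['A.java', 'B.java', 'C.java']
get_vector, whitespace_id = build_vocabulary(files)
vectors = vectorize_file('A.java', get_vector)   # one one-hot numpy vector per token
print(len(vectors), vectors[0].shape)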
Example No. 5
def tokenize_file_to_repair(file_path, error):
    spaces, tokens = jlu.tokenize_with_white_space(jlu.open_file(file_path))

    info = {}

    token_started = False
    token_line_start = -1
    token_line_end = -1
    count = 0

    tokens_errored = []
    n_lines = 6  # context window (in lines) kept around the error line

    # Token-index bounds of the context window (start/end) and of the tagged span.
    start = len(tokens)
    end = 0

    from_token = 0
    to_token = 0

    for token, space in zip(tokens, spaces):
        # Track the token-index window covering +/- n_lines around the error line.
        if int(error['line']) - n_lines <= token.position[0] <= int(error['line']) + n_lines:
            start = min(count, start)
            end = max(count, end)
        # Remember where the error line itself starts and ends (in token indices).
        if not token_started and int(error['line']) == token.position[0]:
            token_started = True
            token_line_start = count
        if token_started and int(error['line']) < token.position[0]:
            token_started = False
            token_line_end = count
        count += 1
    # Pad the context window by two tokens on each side.
    start = max(0, start - 2)
    end = min(len(tokens), end + 2)
    if token_line_end == -1:
        token_line_end = token_line_start


    if 'column' in error and error['type'] != 'OneStatementPerLine':
        # A column is reported: centre the tagged span on the last token whose
        # position does not exceed the reported (line, column).
        errored_token_index = -1
        around = 10
        for index, token in enumerate(tokens):
            if token.position[0] <= int(error['line']) and token.position[1] <= int(error['column']):
                errored_token_index = index
        from_token = max(0, errored_token_index - around)
        to_token = min(len(tokens), errored_token_index + 1 + around)
    else:
        # No usable column: fall back to a span built from the error line itself.
        around = 2
        around_after = 13
        if token_line_start != -1:
            from_token = max(start, token_line_start - around)
            to_token = min(end, token_line_end + around_after + 1)
        else:
            # The error line holds no token: anchor on the last token before it.
            errored_token_index = -1
            around = 2
            around_after = 18
            for index, token in enumerate(tokens):
                if token.position[0] < int(error['line']):
                    errored_token_index = index
            from_token = max(0, errored_token_index - around)
            to_token = min(len(tokens), errored_token_index + 1 + around_after)
    # Tokens inside the tagged span alone, without the surrounding context.
    tokens_errored_in_tag = []
    for token, space in zip(tokens[from_token:to_token],
                            spaces[from_token:to_token]):
        tokens_errored_in_tag.append(get_token_value(token))
        tokens_errored_in_tag.append(get_space_value(space))

    # Full errored sequence: leading context, <type> tag, tagged span, </type> tag, trailing context.
    for token, space in zip(tokens[start:from_token],
                            spaces[start:from_token]):
        tokens_errored.append(get_token_value(token))
        tokens_errored.append(get_space_value(space))
    tokens_errored.append(f'<{error["type"]}>')
    for token, space in zip(tokens[from_token:to_token],
                            spaces[from_token:to_token]):
        tokens_errored.append(get_token_value(token))
        tokens_errored.append(get_space_value(space))
    tokens_errored.append(f'</{error["type"]}>')
    for token, space in zip(tokens[to_token:end], spaces[to_token:end]):
        tokens_errored.append(get_token_value(token))
        tokens_errored.append(get_space_value(space))

    info['from_token'] = from_token
    info['to_token'] = to_token
    info['start'] = start
    info['end'] = end
    info['error'] = error
    info['tokens_errored_in_tag'] = tokens_errored_in_tag

    return tokens_errored, info
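A minimal usage sketch for the repair tokenizer, again with a hypothetical error record and path; the printed keys are the ones the function stores in info above.

error = {'line': '42', 'column': '17', 'type': 'WhitespaceAfter'}   # hypothetical record
tokens_errored, info = tokenize_file_to_repair('Errored.java', error)  # hypothetical path
print(info['from_token'], info['to_token'], info['start'], info['end'])
print(info['tokens_errored_in_tag'][:10])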