Пример #1
0
def get_dialects(data, encoding):
    """Enumerate candidate dialects over the WRANGLER_DELIMS delimiters.

    For every (delimiter, quotechar) combination, the escape character
    candidates are the characters that directly precede the delimiter or
    the quotechar somewhere in ``data`` and for which
    ``is_potential_escapechar(char, encoding)`` is truthy, plus the empty
    string (meaning "no escape character").

    Returns a list holding one ``Dialect(delim, quotechar, escapechar)`` for
    each candidate combination.
    """
    delims = WRANGLER_DELIMS
    quotechars = get_potential_quotechars(data)

    # Maps (delimiter, quotechar) -> set of escape character candidates.
    escapechars = {}
    for delim in delims:
        # Escape candidates found right in front of this delimiter.
        before_delim = {
            prev
            for prev, cur in pairwise(data)
            if cur == delim and is_potential_escapechar(prev, encoding)
        }
        for quotechar in quotechars:
            candidates = set(before_delim)
            # Escape candidates found right in front of this quotechar.
            candidates.update(
                prev
                for prev, cur in pairwise(data)
                if cur == quotechar and is_potential_escapechar(prev, encoding)
            )
            # The empty string is always a candidate.
            candidates.add("")
            escapechars[(delim, quotechar)] = candidates

    return [
        Dialect(delim, quotechar, escapechar)
        for delim in delims
        for quotechar in quotechars
        for escapechar in escapechars[(delim, quotechar)]
    ]
Пример #2
0
def maybe_has_escapechar(data, encoding, delim, quotechar):
    """Tell whether ``data`` may contain an escape character.

    Returns ``False`` right away when neither ``delim`` nor ``quotechar``
    occurs in ``data``. Otherwise returns ``True`` if some character that
    directly precedes ``delim`` or ``quotechar`` satisfies
    ``is_potential_escapechar(char, encoding)``, and ``False`` if none does.
    """
    if delim not in data and quotechar not in data:
        return False

    specials = [delim, quotechar]
    return any(
        cur in specials and is_potential_escapechar(prev, encoding)
        for prev, cur in pairwise(data)
    )
Пример #3
0
def get_escapechar_options(data, encoding, delim, quotechar):
    """Collect the escape character candidates for one delimiter/quotechar.

    A character is a candidate when ``is_potential_escapechar(char,
    encoding)`` is truthy, it directly precedes ``delim`` or ``quotechar``
    in ``data``, and it is not itself equal to ``delim`` or ``quotechar``.

    Returns the candidates as a set (empty when there are none).
    """
    specials = [delim, quotechar]
    return {
        prev
        for prev, cur in pairwise(data)
        if is_potential_escapechar(prev, encoding)
        and cur in specials
        and prev not in specials
    }
Пример #4
0
def break_ties_four(data, dialects):
    """Try to break a tie between several dialects sharing one delimiter.

    Returns ``None`` unless all ``dialects`` have the same delimiter.
    Neighbouring dialects (as yielded by ``pairwise``) whose ``parse_file``
    results are equal are reduced with ``break_ties_two``. If two or three
    distinct dialects remain afterwards, the decision is delegated to
    ``break_ties_two`` or ``break_ties_three``; in every other case the
    result is ``None``.
    """
    # NOTE: Only a single case requiring this function was seen during
    # development; revisit it if more examples turn up.

    if len({d.delimiter for d in dialects}) != 1:
        return None

    # Neighbouring dialects that parse the data to the same result.
    tied_pairs = [
        (left, right)
        for left, right in pairwise(dialects)
        if parse_file(data, left) == parse_file(data, right)
    ]

    # Resolve each tied pair; a pair that cannot be resolved contributes
    # nothing, but both of its members still count as handled.
    survivors = set()
    handled = set()
    for left, right in tied_pairs:
        winner = break_ties_two(data, left, right)
        if winner is not None:
            survivors.add(winner)
        handled.update((left, right))

    # Dialects that were in no tied pair are carried over unchanged.
    survivors.update(d for d in dialects if d not in handled)

    remaining = list(survivors)

    # Hand over to the smaller tie breakers if the candidate set shrank.
    if len(remaining) == 2:
        return break_ties_two(data, *remaining)
    if len(remaining) == 3:
        return break_ties_three(data, *remaining)
    return None
Пример #5
0
def get_potential_dialects(data, encoding):
    """Build the list of candidate dialects for ``data``.

    Escape character candidates for a (delimiter, quotechar) combination are
    the characters for which ``is_potential_escapechar(char, encoding)`` is
    truthy and that occur at least once directly before that delimiter or
    quotechar in ``data``. The empty string is always a candidate as well.

    Self-escaping (a doubled backslash standing for one literal backslash)
    needs no special treatment here. Escaping a backslash with a backslash
    only makes sense in a file that already uses the backslash to escape
    the delimiter or quotechar, and in that case it is picked up anyway. If
    it never escapes the delimiter or quotechar, self-escaping is not
    needed.

    Combinations for which ``masked_by_quotechar(data, quotechar,
    escapechar, delim)`` is truthy are left out.

    Returns a list of ``Dialect(delim, quotechar, escapechar)`` objects.
    """
    delims = get_potential_delimiters(data, encoding)
    quotechars = get_potential_quotechars(data)

    # Every combination starts out with "no escape character" as its only
    # candidate.
    escapechars = {
        (delim, quotechar): {""}
        for delim, quotechar in itertools.product(delims, quotechars)
    }

    for prev, cur in pairwise(data):
        if not is_potential_escapechar(prev, encoding):
            continue
        # `prev` is a candidate for each combination whose delimiter or
        # quotechar it directly precedes.
        for (delim, quotechar), candidates in escapechars.items():
            if cur == delim or cur == quotechar:
                candidates.add(prev)

    return [
        Dialect(delim, quotechar, escapechar)
        for delim in delims
        for quotechar in quotechars
        for escapechar in escapechars[(delim, quotechar)]
        if not masked_by_quotechar(data, quotechar, escapechar, delim)
    ]
Пример #6
0
def break_ties_two(data, A, B):
    """Try to break a tie between the dialects ``A`` and ``B``.

    Three situations are handled, tested in this order:

    * Same delimiter and escapechar: if one of the quotechars is the empty
      string, ``data`` is parsed with both dialects. The dialect without a
      quotechar is returned when the results are equal, the other one when
      they differ.
    * Same quotechar and escapechar: for the delimiter pair comma/space the
      comma dialect is returned; if one delimiter is a dash, the other
      dialect is returned.
    * Same delimiter and quotechar: ``data`` is parsed with both dialects
      and, if the results have the same shape, the first cell that differs
      is inspected to decide whether the escapechar is functional.

    Returns the chosen dialect, or ``None`` when the tie cannot be broken.
    """
    same_delim = A.delimiter == B.delimiter
    same_quote = A.quotechar == B.quotechar
    same_escape = A.escapechar == B.escapechar

    if same_delim and same_escape:
        if not (A.quotechar == "" or B.quotechar == ""):
            return None
        unquoted = A if A.quotechar == "" else B
        quoted = B if unquoted == A else A

        rows_unquoted = parse_file(data, dialect=unquoted)
        rows_quoted = parse_file(data, dialect=quoted)

        # Equal results mean the quotechar has no effect on the parse.
        return unquoted if rows_unquoted == rows_quoted else quoted

    if same_quote and same_escape:
        if sorted([A.delimiter, B.delimiter]) == [" ", ","]:
            # Artifact due to type detection (comma as radix point).
            return A if A.delimiter == "," else B
        if A.delimiter == "-" or B.delimiter == "-":
            # Artifact due to type detection (dash as minus sign).
            return B if A.delimiter == "-" else A
        return None

    if not (same_delim and same_quote):
        return None

    # Only the escapechar differs from here on.
    plain, escaped = (A, B) if A.escapechar == "" else (B, A)

    rows_plain = parse_file(data, plain)
    rows_escaped = parse_file(data, escaped)

    # Double check the shape. A difference in shape would normally be
    # caught by the pattern score already; if it was not, the tie cannot be
    # broken (for now).
    if len(rows_plain) != len(rows_escaped):
        return None
    if any(
        len(row_p) != len(row_e)
        for row_p, row_e in zip(rows_plain, rows_escaped)
    ):
        return None

    # Cells, as parsed without the escapechar, that differ between the two
    # parses.
    differing = [
        cell_p
        for row_p, row_e in zip(rows_plain, rows_escaped)
        for cell_p, cell_e in zip(row_p, row_e)
        if cell_p != cell_e
    ]
    if not differing:
        return None

    # Tie-breaking rule: if the escapechar precedes the quotechar an even
    # (non-zero) number of times within the offending cell, it is treated
    # as a functional escape and the escaped dialect wins. An odd number of
    # escaped quotechars would change the shape of the file when ignored,
    # so only an even count can produce the same shape.
    #
    # NOTE: only the first differing cell decides the outcome.
    esc = escaped.escapechar
    quote = escaped.quotechar
    hits = sum(
        1
        for prev, cur in pairwise(differing[0])
        if prev == esc and cur == quote
    )
    if hits > 0 and hits % 2 == 0:
        return escaped
    return plain