Example #1
 def guess_dialect(self, sample):
     sniffer = Sniffer()
     try:
         dialect = sniffer.sniff(sample)
         has_header = sniffer.has_header(sample)
     except Error:  # delimiter detection failed – maybe there is an empty column: "89.187.1.81,06-05-2016,,CZ,botnet drone"
         has_header = False  # let's just guess the value
         s = sample.split("\n")[1]  # skip the header line (it surely contains no empty column)
         delimiter = ""
         for dl in (",", ";", "|"):  # let's suppose a doubled sign is the delimiter
             if s.find(dl + dl) > -1:
                 delimiter = dl
                 break
         if not delimiter:  # try to find anything that resembles a delimiter
             for dl in (",", ";", "|"):
                 if s.find(dl) > -1:
                     delimiter = dl
                     break
         dialect = csv.unix_dialect
         dialect.delimiter = delimiter
     if not dialect.escapechar:
         dialect.escapechar = '\\'
     # dialect.quoting = 3
     dialect.doublequote = True
     return dialect, has_header
Example #2
 def guess_dialect(self, sample):
     sniffer = Sniffer()
     try:
         dialect = sniffer.sniff(sample)
         has_header = sniffer.has_header(sample)
     except Error:  # delimiter detection failed – maybe there is an empty column: "89.187.1.81,06-05-2016,,CZ,botnet drone"
         if sample.strip() == "":
             print("The file seems empty")
             quit()
         has_header = False  # let's just guess the value
         try:
             s = sample.split("\n")[1]  # skip the header line (it surely contains no empty column)
         except IndexError:  # there is a single line in the file
             s = sample.split("\n")[0]
         delimiter = ""
         for dl in (",", ";", "|"):  # let's suppose a doubled sign is the delimiter
             if s.find(dl + dl) > -1:
                 delimiter = dl
                 break
         if not delimiter:  # try to find anything that resembles a delimiter
             for dl in (",", ";", "|"):
                 if s.find(dl) > -1:
                     delimiter = dl
                     break
         dialect = csv.unix_dialect
         dialect.delimiter = delimiter
     if not dialect.escapechar:
         dialect.escapechar = '\\'
     # dialect.quoting = 3
     dialect.doublequote = True
     return dialect, has_header
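As a rough illustration of how the returned tuple might be consumed (this is not part of the original example), here is a minimal sketch; the `parser` instance, the sample string and the `import csv` / `from io import StringIO` lines are assumptions:

import csv
from io import StringIO

sample = "89.187.1.81,06-05-2016,,CZ,botnet drone\n1.2.3.4,07-05-2016,,DE,spam\n"  # hypothetical data
dialect, has_header = parser.guess_dialect(sample)  # `parser` is an instance of the class defining the method above
rows = list(csv.reader(StringIO(sample), dialect=dialect))
if has_header:
    header, rows = rows[0], rows[1:]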
Example #3
 def __init__(self, inFile):
     from csv import Sniffer, reader
     csvFile = open(inFile, 'r')
     sample = csvFile.read(1024)
     csvFile.seek(0)
     sniffer = Sniffer()  # Sniffer methods need an instance, not the class itself
     self.reader = reader(csvFile, sniffer.sniff(sample))
     if sniffer.has_header(sample):
         self.varNames = next(self.reader)
     else:
         self.varNames = None
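A short usage sketch for the constructor above, assuming it belongs to a small wrapper class; the class name `CsvSource` and the file name are hypothetical:

src = CsvSource("data.csv")  # hypothetical wrapper class built around the __init__ above
print(src.varNames)          # header row as a list, or None if no header was detected
for row in src.reader:
    print(row)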
Example #5
def converter():
    if not session['extension'] or not session['content_type']:
        flash('Unsupported file type', 'info')
        return redirect(url_for('.dropzone'))

    session['processed'] = False
    session['outputs'] = mkdtemp()

    is_csv = (session['extension'] == '.csv'
              or session['content_type'].startswith('text/csv'))
    is_excel = session['extension'] in ['.xls', '.xlsx'] or any(
        s in session['content_type']
        for s in ['spreadsheet', 'xls', 'xlsx', 'excel'])
    is_text = (session['extension'] == '.txt'
               or session['content_type'].startswith('text/'))

    if is_csv:
        file_name = listdir(session['tmp_dir'])[0]
        file_path = join(session['tmp_dir'], file_name)

        # guess file encoding
        encoding = get_encoding(file_path)

        # guess separator
        with open(file_path, encoding=encoding) as f:
            sniffer = Sniffer()
            line = f.readline().encode(encoding).decode('utf-8')
            dialect = sniffer.sniff(line)

        df = pd.read_csv(file_path, encoding=encoding, dialect=dialect)

        session['fields'] = df.columns.tolist()

    elif is_excel:
        file_name = listdir(session['tmp_dir'])[0]
        file_path = join(session['tmp_dir'], file_name)

        df = pd.read_excel(file_path, encoding='utf-8')
        session['fields'] = df.columns.tolist()

    elif is_text:
        session['fields'] = ['id', 'text']
        dataset_json = texts_to_json(session['tmp_dir'])
        df = DataFrame(dataset_json)

    else:
        flash('Uploaded file types have not been recognized')
        return redirect(url_for('.dropzone'))

    df.to_csv(join(session['outputs'], 'original.csv'),
              index=False,
              encoding='utf-8')

    return redirect(url_for('.field_selection_get'))
Example #6
 def guess_dialect(f):
     # Taken from the Python standard docs, with thanks to Piers Goodhew <*****@*****.**>
     # <https://docs.python.org/2/library/csv.html#csv.Sniffer>
     s = Sniffer()
     try:
         retval = s.sniff(f.read(1024), [',', '\t'])  # 1024 taken from the Python docs
     except CSVError:
         retval = 'excel'
     finally:
         f.seek(0)  # the f.read() above moves the file cursor in the CSV file
     return retval
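A possible way to use the helper above, assuming `from csv import Sniffer, reader, Error as CSVError` is in scope and the file name is hypothetical; csv.reader accepts either the sniffed Dialect object or the 'excel' fallback name:

with open("data.csv", newline="") as f:  # hypothetical file
    dialect = guess_dialect(f)  # the helper rewinds the file before returning
    for row in reader(f, dialect):
        print(row)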
Example #8
def getDelimiter(path):
    sniffer = Sniffer()
    with open(path, 'r') as rfile:
        header = rfile.readline()
        sample = header + rfile.readline() + rfile.readline()
    try:
        asniff = sniffer.sniff(sample, delimiters=";, ")
    except Exception:
        # sniffing failed – fall back to a plain comma-separated guess

        class tsniff(object):
            lineterminator = "\n"
            delimiter = ","

        asniff = tsniff()
    try:
        has_header = sniffer.has_header(sample)
    except Exception:  # has_header() sniffs the sample again and may fail the same way
        has_header = False
    return asniff.delimiter, has_header
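For illustration, a sketch of consuming the detector above; `import csv` and the path are assumptions:

import csv

delimiter, has_header = getDelimiter("input.csv")  # hypothetical path
with open("input.csv", newline="") as f:
    rows = list(csv.reader(f, delimiter=delimiter))
if has_header:
    header, rows = rows[0], rows[1:]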
Example #9
    def read(file):

        with open(file, "r") as csv:

            sniffer = Sniffer()
            sample = csv.read(4096)
            dialect = sniffer.sniff(sample, delimiters=[';', ','])

            csv.seek(0)

            lines_reader = DictReader(csv, dialect=dialect)

            lines = []
            for line in lines_reader:
                lines.append(line)

            return lines, lines_reader.fieldnames
Example #10
def p_csv(dialect: Optional[str], padding: bool) -> int:
    data = stdin.read()
    sniffer = Sniffer()
    has_header = sniffer.has_header(data)

    try:
        if not has_header:
            print(data, end="")
            return 0
        else:
            d = dialect or sniffer.sniff(data)
            r = _read(data, dialect=d, padding=padding)
            w = writer(stdout, dialect=d)
            w.writerows(r)
    except CSVErr as e:
        log.critical("%s", f"{ERROR}{linesep}{e}")
        return 1
    else:
        return 0
Example #11
    def read(filename):

        with open(filename, "r") as csvfile:

            sniffer = Sniffer()
            sample = csvfile.read(1024)
            dialect = sniffer.sniff(sample, delimiters=[';', ','])

            if sniffer.has_header(sample):
                # the sample appears to have a header row; the result is not used further here
                pass

            csvfile.seek(0)

            lines_reader = DictReader(csvfile, dialect=dialect)

            lines = []
            for line in lines_reader:
                lines.append(line)

            return lines
Example #12
    def read(filename):

        with open(filename, "r") as csvfile:

            sniffer = Sniffer()
            sample = csvfile.read(4096)
            dialect = sniffer.sniff(sample, delimiters=[';', ','])

            if sniffer.has_header(sample):
                # the sample appears to have a header row; the result is not used further here
                pass

            csvfile.seek(0)

            lines_reader = DictReader(csvfile, dialect=dialect)

            lines = []
            for line in lines_reader:
                lines.append(line)

            return lines, lines_reader.fieldnames
Example #13
    def _read_file_safe(
        self, some_file: File, ignore_headers: bool
    ) -> Generator[Either[Message, SomeModel], None, None]:
        sniffer = Sniffer()
        try:
            with some_file as csv:
                dialect = sniffer.sniff(csv.read(1024))

                csv.seek(0)

                reader = DictReader(f=csv,
                                    fieldnames=self._fields,
                                    dialect=dialect)

                yield from self._read(reader, ignore_headers)
        except Exception as e:
            message = Message(category=MessageCategory.ERROR,
                              key='import_csv_generic_error',
                              args=[e])

            yield Left([message])
Example #14
 def show_delimeter(self):
     sniffer = Sniffer()
     # `preferred` is an undocumented Sniffer attribute; multi-character entries such as '::' or '||' are never matched
     sniffer.preferred = [',', '|', ';', ':', '::', '||']
     dialect = sniffer.sniff(self.show_line())
     self.deli = dialect.delimiter
     return self.deli
Example #15
    def guess_dialect(sample):
        sniffer = Sniffer()
        sample_text = "".join(sample)
        try:
            dialect = sniffer.sniff(sample_text)
            has_header = sniffer.has_header(sample_text)
            if re.match("[a-z]", dialect.delimiter.lower()):
                # we do not allow letters to be delimiters, that seems like nonsense
                raise Error
        except Error:  # delimiter detection failed – maybe there is an empty column: "89.187.1.81,06-05-2016,,CZ,botnet drone"
            if sample_text.strip() == "":
                print("The file seems empty")
                quit()

            # header detection
            lines = [line.strip() for line in sample]
            if len(lines[1:]) > 0:
                header_to_rows_similarity = mean(
                    [SequenceMatcher(None, lines[0], it).ratio() for it in lines[1:]])
                if len(lines[1:]) > 1:
                    rows_similarity = mean([
                        SequenceMatcher(None, *comb).ratio()
                        for comb in itertools.combinations(lines[1:], 2)
                    ])
                    # the first line differs from the rest -> probably a header
                    has_header = rows_similarity > header_to_rows_similarity + 0.1
                else:
                    has_header = header_to_rows_similarity < 0.5
            else:
                has_header = False

            try:
                s = sample[1]  # skip the header line (it surely contains no empty column)
            except IndexError:  # there is a single line in the file
                s = sample[0]
            delimiter = ""
            for dl in (",", ";", "|"):  # let's suppose a doubled sign is the delimiter
                if s.find(dl + dl) > -1:
                    delimiter = dl
                    break
            if not delimiter:  # try to find anything that resembles a delimiter
                for dl in (",", ";", "|"):
                    if s.find(dl) > -1:
                        delimiter = dl
                        break
            dialect = csv.unix_dialect
            if delimiter:
                dialect.delimiter = delimiter
        if not dialect.escapechar:
            dialect.escapechar = '\\'
        # dialect.quoting = 3
        dialect.doublequote = True

        seems_single = False
        if len(sample) == 1:
            # there is a single line in the sample (and hence in the input), so this is definitely not a header
            has_header = False
            if dialect.delimiter not in [".", ",", "\t"] and "|" not in sample_text:
                # use case: a short one-line input like "convey hello" would otherwise produce a silly "l" delimiter
                # XX should be None maybe, let's think a whole row is a single column – but then we could not add columns
                dialect.delimiter = "|"
                seems_single = True
        if dialect.delimiter == "." and "," not in sample_text:
            # let's prefer the common use case (a bare list of IP addresses) over a strange use case with "." delimiting
            dialect.delimiter = ","
        return dialect, has_header, seems_single
Example #16
def validate_file(path: str, task: str, file_ext: str, col_mapping: Dict[str, str]):
    file_name = os.path.basename(path)
    if file_ext in ("csv", "tsv"):
        if task == "entity_extraction":
            raise InvalidFileError(
                f"AutoNLP does not support '{file_ext}' files for entity_extraction tasks. Use .json or .jsonl files!"
            )
        sniffer = Sniffer()
        with open(path, encoding="utf-8") as f:
            sample = "\n".join([f.readline() for _ in range(500)])

        # Validate delimiter
        expected_delimiter = "\t" if file_ext == "tsv" else ","
        actual_delimiter = sniffer.sniff(sample, delimiters=",;\t").delimiter

        if actual_delimiter != expected_delimiter:
            if task == "entity_extraction":
                additional_help = (
                    "\nFor entity_extraction tasks, AutoNLP expects tokens / tags to be tab-separated "
                    "and sentences to be empty-line separated.")
            else:
                additional_help = ""
            raise InvalidFileError(
                "Incorrect delimiter '" +
                (r"\t" if actual_delimiter == "\t" else actual_delimiter) +
                f"' for file '{file_name}'! " + "Expected delimiter is: '" +
                (r"\t" if expected_delimiter == "\t" else expected_delimiter) +
                "'." + additional_help)

        # Extract column_names
        column_names = sample.splitlines()[0].split(actual_delimiter)

    elif file_ext in ("json", "jsonl"):
        with open(path, encoding="utf-8") as f:
            first_line = f.readline()
            second_line = f.readline()
        try:
            json.loads(first_line)
            json.loads(second_line)
        except ValueError:
            raise InvalidFileError(
                f"File `{file_name}` is not a valid JSON-lines file! Each line must be a valid JSON object."
            )

        # Extract column_names
        column_names = list(json.loads(first_line).keys())

    else:
        raise InvalidFileError(
            f"AutoNLP does not support `.{file_ext}` files yet!")

    invalid_columns_source = set(col_mapping.keys()) - set(column_names)
    if invalid_columns_source:
        raise InvalidColMappingError(
            "Columns " +
            ",".join([f"'{col_name}'"
                      for col_name in invalid_columns_source]) +
            " could not be found in the provided file (which has columns: " +
            ",".join([f"'{col_name}'" for col_name in column_names]) + ")")

    invalid_columns_target = set(COLUMNS_PER_TASK[task]) - set(
        col_mapping.values())
    if invalid_columns_target:
        raise InvalidColMappingError(
            "\n".join(["Provided column mapping is:"] + [
                f"   '{src_col}' -> '{dst_col}'"
                for src_col, dst_col in col_mapping.items()
            ] + ["While expecting column mapping like:"] + [
                f"   'original_col_name' -> '{col_name}' (AutoNLP column name)"
                for col_name in COLUMNS_PER_TASK[task]
            ]))
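A hedged sketch of invoking the validator above; the file path, task name, and column mapping are purely illustrative, and COLUMNS_PER_TASK plus the exception classes are assumed to come from the surrounding module:

try:
    validate_file(
        path="train.csv",                 # hypothetical file
        task="binary_classification",     # hypothetical task name
        file_ext="csv",
        col_mapping={"review": "text", "sentiment": "target"},  # hypothetical mapping
    )
except (InvalidFileError, InvalidColMappingError) as err:
    print(err)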
Example #17
    def detect_csv_delimiter(str_array: str) -> str:
        sniffer = Sniffer()
        dialect = sniffer.sniff(str_array)

        return dialect.delimiter
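To round things off, a minimal sketch of calling the detector above (treated here as a plain function; the sample string is made up):

sample = "name;age;city\nAlice;30;Prague\nBob;25;Brno\n"  # hypothetical data
print(detect_csv_delimiter(sample))  # should print ";"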