def guess_dialect(self, sample):
    """Guess the CSV dialect of *sample* text and whether it has a header.

    :param sample: raw text read from the beginning of a CSV file
    :return: tuple ``(dialect, has_header)``
    """
    sniffer = Sniffer()
    try:
        dialect = sniffer.sniff(sample)
        has_header = sniffer.has_header(sample)
    except Error:  # delimiter failed – maybe there is an empty column: "89.187.1.81,06-05-2016,,CZ,botnet drone"
        has_header = False  # let's just guess the value
        lines = sample.split("\n")
        # we don't take the header (there is no empty column there for sure);
        # fall back to the first line when there is only one (was an IndexError)
        s = lines[1] if len(lines) > 1 else lines[0]
        delimiter = ""
        for dl in (",", ";", "|"):  # let's suppose the doubled sign is delimiter
            if s.find(dl + dl) > -1:
                delimiter = dl
                break
        if not delimiter:  # try to find anything that resembles a delimiter
            for dl in (",", ";", "|"):
                if s.find(dl) > -1:
                    delimiter = dl
                    break

        # Subclass instead of assigning onto csv.unix_dialect: mutating the
        # shared class would leak the guessed delimiter into every other user.
        class _GuessedDialect(csv.unix_dialect):
            pass

        dialect = _GuessedDialect
        if delimiter:  # "" is not a valid csv delimiter – keep the ',' default
            dialect.delimiter = delimiter
    if not dialect.escapechar:
        dialect.escapechar = '\\'
    # dialect.quoting = 3
    dialect.doublequote = True
    return dialect, has_header
def guess_dialect(self, sample):
    """Guess the CSV dialect of *sample* text and whether it has a header.

    :param sample: raw text read from the beginning of a CSV file
    :return: tuple ``(dialect, has_header)``
    """
    sniffer = Sniffer()
    try:
        dialect = sniffer.sniff(sample)
        has_header = sniffer.has_header(sample)
    except Error:  # delimiter failed – maybe there is an empty column: "89.187.1.81,06-05-2016,,CZ,botnet drone"
        if sample.strip() == "":
            print("The file seems empty")
            quit()  # NOTE(review): exiting from library code is drastic; kept for compatibility
        has_header = False  # let's just guess the value
        try:
            s = sample.split("\n")[1]  # we don't take the header (there is no empty column for sure)
        except IndexError:  # there is a single line in the file
            s = sample.split("\n")[0]
        delimiter = ""
        for dl in (",", ";", "|"):  # let's suppose the doubled sign is delimiter
            if s.find(dl + dl) > -1:
                delimiter = dl
                break
        if not delimiter:  # try to find anything that resembles a delimiter
            for dl in (",", ";", "|"):
                if s.find(dl) > -1:
                    delimiter = dl
                    break

        # Subclass instead of assigning onto csv.unix_dialect: mutating the
        # shared class would leak the guessed delimiter into every other user.
        class _GuessedDialect(csv.unix_dialect):
            pass

        dialect = _GuessedDialect
        if delimiter:  # "" is not a valid csv delimiter – keep the ',' default
            dialect.delimiter = delimiter
    if not dialect.escapechar:
        dialect.escapechar = '\\'
    # dialect.quoting = 3
    dialect.doublequote = True
    return dialect, has_header
def __init__(self, inFile):
    """Open *inFile* as CSV, sniff its dialect and expose a reader.

    Sets ``self.reader`` (a csv.reader over the open file) and
    ``self.varNames`` (first row when a header is detected, else None).
    The file handle stays open deliberately – the reader consumes it lazily.
    """
    from csv import Sniffer, reader
    # BUG FIX: Sniffer.sniff/has_header were called on the class itself,
    # which passes the sample string as `self` and blows up at runtime.
    sniffer = Sniffer()
    csvFile = open(inFile, 'r')
    sample = csvFile.read(1024)
    csvFile.seek(0)  # rewind so the reader starts at the first row
    self.reader = reader(csvFile, sniffer.sniff(sample))
    if sniffer.has_header(sample):
        self.varNames = next(self.reader)  # consume the header row
    else:
        self.varNames = None
def converter():
    """Convert the uploaded file (csv / excel / plain text) to a canonical CSV.

    Reads upload metadata from the Flask session, writes 'original.csv' into a
    fresh temp directory stored in session['outputs'], records the detected
    column names in session['fields'], then redirects to the field-selection
    view.  NOTE(review): assumes session['extension'] / session['content_type']
    / session['tmp_dir'] were set by the upload view – verify the dropzone handler.
    """
    if not session['extension'] or not session['content_type']:
        flash('Unsupported file type', 'info')
        return redirect(url_for('.dropzone'))
    session['processed'] = False
    session['outputs'] = mkdtemp()  # fresh directory for the converted output
    # Classify the upload by extension first, content-type as a fallback.
    is_csv = (session['extension'] == '.csv'
              or session['content_type'].startswith('text/csv'))
    is_excel = session['extension'] in ['.xls', '.xlsx'] or any(
        s in session['content_type']
        for s in ['spreadsheet', 'xls', 'xlsx', 'excel'])
    is_text = (session['extension'] == '.txt'
               or session['content_type'].startswith('text/'))
    if is_csv:
        # the upload dir is expected to hold exactly one file
        file_name = listdir(session['tmp_dir'])[0]
        file_path = join(session['tmp_dir'], file_name)
        # guess file encoding
        encoding = get_encoding(file_path)
        # guess separator from the first line only
        with open(file_path, encoding=encoding) as f:
            sniffer = Sniffer()
            # NOTE(review): re-encoding the already-decoded line with *encoding*
            # and decoding it as utf-8 only round-trips for utf-8-compatible
            # encodings – confirm the intent here.
            line = f.readline().encode(encoding).decode('utf-8')
            dialect = sniffer.sniff(line)
        df = pd.read_csv(file_path, encoding=encoding, dialect=dialect)
        session['fields'] = df.columns.tolist()
    elif is_excel:
        file_name = listdir(session['tmp_dir'])[0]
        file_path = join(session['tmp_dir'], file_name)
        # NOTE(review): pandas removed read_excel's `encoding` kwarg in 1.2 –
        # this call fails on modern pandas; confirm the pinned version.
        df = pd.read_excel(file_path, encoding='utf-8')
        session['fields'] = df.columns.tolist()
    elif is_text:
        session['fields'] = ['id', 'text']
        dataset_json = texts_to_json(session['tmp_dir'])
        df = DataFrame(dataset_json)
    else:
        flash('Uploaded file types have not been recognized')
        return redirect(url_for('.dropzone'))
    df.to_csv(join(session['outputs'], 'original.csv'), index=False,
              encoding='utf-8')
    return redirect(url_for('.field_selection_get'))
def guess_dialect(f):
    """Sniff the CSV dialect of open file *f*; rewind the file afterwards.

    Falls back to the 'excel' dialect name when sniffing fails.
    """
    # Taken from the Python standard docs, with thanks to Piers Goodhew
    # <https://docs.python.org/2/library/csv.html#csv.Sniffer>
    sniffer = Sniffer()
    try:
        dialect = sniffer.sniff(f.read(1024), [',', '\t', ])  # 1024 taken from the Python docs
    except CSVError:
        dialect = 'excel'
    finally:
        f.seek(0)  # the f.read above moved the file-cursor in the CSV file
    return dialect
def guess_dialect(f):
    """Return the sniffed dialect of *f* ('excel' on failure), rewinding *f*."""
    # Taken from the Python standard docs, with thanks to Piers Goodhew
    # <https://docs.python.org/2/library/csv.html#csv.Sniffer>
    result = 'excel'  # default dialect name when sniffing fails
    try:
        result = Sniffer().sniff(f.read(1024), [',', '\t', ])  # 1024 taken from the Python docs
    except CSVError:
        pass
    finally:
        f.seek(0)  # undo the cursor movement caused by f.read
    return result
def getDelimiter(path):
    """Guess the delimiter of the CSV file at *path*.

    Samples the first three lines.  Returns ``(delimiter, has_header)``;
    falls back to ',' / False when sniffing fails.
    """
    sniffer = Sniffer()
    with open(path, 'r') as rfile:
        header = rfile.readline()
        sample = header + rfile.readline() + rfile.readline()
    try:
        asniff = sniffer.sniff(sample, delimiters=";, ")
    except Exception:
        class tsniff(object):
            lineterminator = "\n"
            delimiter = ","
        asniff = tsniff()
    asniff.lineterminator = "\n"
    # BUG FIX: has_header re-sniffs the sample internally, so on unsniffable
    # input it used to re-raise the very error caught above – guard it too.
    try:
        has_header = sniffer.has_header(sample)
    except Error:
        has_header = False
    return asniff.delimiter, has_header
def read(file):
    """Parse *file* as CSV (';' or ',' separated).

    Returns ``(rows, fieldnames)`` where rows is a list of dicts keyed by
    the header row.
    """
    with open(file, "r") as handle:
        dialect = Sniffer().sniff(handle.read(4096), delimiters=[';', ','])
        handle.seek(0)  # rewind after sampling
        parser = DictReader(handle, dialect=dialect)
        rows = list(parser)
    return rows, parser.fieldnames
def p_csv(dialect: Optional[str], padding: bool) -> int:
    """Pretty-print CSV arriving on stdin; return a process exit code.

    Header-less input is echoed untouched.  Returns 1 when csv parsing or
    writing fails, 0 otherwise.
    """
    data = stdin.read()
    sniffer = Sniffer()
    has_header = sniffer.has_header(data)
    try:
        if not has_header:
            # nothing to align – pass the input through verbatim
            print(data, end="")
            return 0
        chosen = dialect or sniffer.sniff(data)
        rows = _read(data, dialect=chosen, padding=padding)
        out = writer(stdout, dialect=chosen)
        out.writerows(rows)
    except CSVErr as e:
        log.critical("%s", f"{ERROR}{linesep}{e}")
        return 1
    return 0
def read(filename):
    """Load the CSV file *filename* (';' or ',' separated) as a list of row dicts."""
    with open(filename, "r") as handle:
        sniffer = Sniffer()
        sample = handle.read(1024)
        dialect = sniffer.sniff(sample, delimiters=[';', ','])
        sniffer.has_header(sample)  # result unused; header row becomes DictReader fieldnames
        handle.seek(0)  # rewind after sampling
        rows = [record for record in DictReader(handle, dialect=dialect)]
    return rows
def read(filename):
    """Load the CSV file *filename* (';' or ',').

    Returns ``(rows, fieldnames)``: rows as dicts keyed by the header row.
    """
    with open(filename, "r") as src:
        sniffer = Sniffer()
        head = src.read(4096)
        dialect = sniffer.sniff(head, delimiters=[';', ','])
        sniffer.has_header(head)  # result unused, kept from the original
        src.seek(0)  # rewind after sampling
        reader = DictReader(src, dialect=dialect)
        collected = list(reader)
    return collected, reader.fieldnames
def _read_file_safe(
        self, some_file: File, ignore_headers: bool
) -> Generator[Either[Message, SomeModel], None, None]:
    """Stream parsed models from *some_file*, yielding Left([Message]) on failure.

    Sniffs the CSV dialect from the first KiB of the file, then delegates row
    parsing to ``self._read``.  Any exception raised while opening, sniffing
    or parsing is converted into a single ``Left`` carrying an
    'import_csv_generic_error' message, so callers never see a raised exception.
    """
    sniffer = Sniffer()
    try:
        with some_file as csv:
            # sample the head of the file to detect delimiter/quoting
            dialect = sniffer.sniff(csv.read(1024))
            csv.seek(0)  # rewind so the reader sees the whole file
            reader = DictReader(f=csv, fieldnames=self._fields, dialect=dialect)
            yield from self._read(reader, ignore_headers)
    except Exception as e:
        # deliberately broad: any failure becomes an error *value*, not a raise
        message = Message(category=MessageCategory.ERROR,
                          key='import_csv_generic_error', args=[e])
        yield Left([message])
def show_delimeter(self):
    """Sniff and cache (in ``self.deli``) the delimiter of the current line."""
    s = Sniffer()
    # bias the sniffer towards these candidates; order expresses preference
    s.preferred = [',', '|', ';', ':', '::', '||']
    self.deli = s.sniff(self.show_line()).delimiter
    return self.deli
def guess_dialect(sample):
    """Guess the dialect of *sample* (a list of raw CSV lines).

    Returns ``(dialect, has_header, seems_single)`` where *seems_single*
    flags a one-line input that looks like a single column rather than CSV.
    """
    sniffer = Sniffer()
    sample_text = "".join(sample)
    try:
        dialect = sniffer.sniff(sample_text)
        has_header = sniffer.has_header(sample_text)
        if re.match(
                "[a-z]", dialect.delimiter.lower()
        ):  # we do not allow letters to be delimiters, seems like non-sense
            raise Error
    except Error:  # delimiter failed – maybe there is an empty column: "89.187.1.81,06-05-2016,,CZ,botnet drone"
        if sample_text.strip() == "":
            print("The file seems empty")
            quit()  # NOTE(review): exiting from library code is drastic; kept for compatibility
        # header detection: compare how similar the first line is to the rest
        stripped = [line.strip() for line in sample]
        if len(stripped[1:]) > 0:
            header_to_rows_similarity = mean(
                SequenceMatcher(None, stripped[0], it).ratio() for it in stripped[1:])
            if len(stripped[1:]) > 1:
                rows_similarity = mean(
                    SequenceMatcher(None, *comb).ratio()
                    for comb in itertools.combinations(stripped[1:], 2))
                # first line differs from mutually-similar rows -> header
                has_header = rows_similarity > header_to_rows_similarity + 0.1
            else:
                has_header = header_to_rows_similarity < 0.5
        else:
            has_header = False
        # we don't take the header (there is no empty column there for sure);
        # fall back to the first line when the file has a single line
        s = sample[1] if len(sample) > 1 else sample[0]
        delimiter = ""
        for dl in (",", ";", "|"):  # let's suppose the doubled sign is delimiter
            if s.find(dl + dl) > -1:
                delimiter = dl
                break
        if not delimiter:  # try to find anything that resembles a delimiter
            for dl in (",", ";", "|"):
                if s.find(dl) > -1:
                    delimiter = dl
                    break

        # Subclass instead of assigning onto csv.unix_dialect: mutating the
        # shared class would leak the guessed delimiter/escapechar into every
        # other csv user in the process.
        class _GuessedDialect(csv.unix_dialect):
            pass

        dialect = _GuessedDialect
        if delimiter:
            dialect.delimiter = delimiter
    if not dialect.escapechar:
        dialect.escapechar = '\\'
    # dialect.quoting = 3
    dialect.doublequote = True
    seems_single = False
    if len(sample) == 1:
        # single line in sample = in the input, so this is definitely not a header
        has_header = False
        if dialect.delimiter not in [".", ",", "\t"] and "|" not in sample_text:
            # usecase: short one-line like "convey hello" would produce stupid "l" delimiter
            # XX should be None maybe, let's think a whole row is a single column – but then we could not add columns
            dialect.delimiter = "|"
            seems_single = True
    if dialect.delimiter == "." and "," not in sample_text:
        # let's propose common use case (bare list of IP addresses) over a
        # strange use case with "." delimiting
        dialect.delimiter = ","
    return dialect, has_header, seems_single
def validate_file(path: str, task: str, file_ext: str, col_mapping: Dict[str, str]):
    """Validate an uploaded dataset file and its column mapping.

    :raises InvalidFileError: unsupported / malformed file or wrong delimiter
    :raises InvalidColMappingError: mapping refers to missing columns or does
        not cover the columns required for *task*
    """
    file_name = os.path.basename(path)
    if file_ext in ("csv", "tsv"):
        if task == "entity_extraction":
            raise InvalidFileError(
                f"AutoNLP does not support '{file_ext}' files for entity_extraction tasks. Use .json or .jsonl files!"
            )
        sniffer = Sniffer()
        with open(path, encoding="utf-8") as f:
            # NOTE(review): readline() keeps the trailing '\n', so this join
            # doubles newlines in the sample; the sniffer tolerates it.
            sample = "\n".join([f.readline() for _ in range(500)])
        # Validate delimiter
        expected_delimiter = "\t" if file_ext == "tsv" else ","
        actual_delimiter = sniffer.sniff(sample, delimiters=",;\t").delimiter
        if actual_delimiter != expected_delimiter:
            if task == "entity_extraction":
                additional_help = (
                    "\nFor entity_extraction tasks, AutoNLP expects tokens / tags to be tab-separated "
                    "and sentences to be empty-line separated.")
            else:
                additional_help = ""
            # BUG FIX: the "Expected delimiter" part of the message previously
            # echoed actual_delimiter instead of expected_delimiter.
            raise InvalidFileError(
                "Incorrect delimiter '"
                + (r"\t" if actual_delimiter == "\t" else actual_delimiter)
                + f"' for file '{file_name}'! "
                + "Expected delimiter is: '"
                + (r"\t" if expected_delimiter == "\t" else expected_delimiter)
                + "'." + additional_help)
        # Extract column_names
        column_names = sample.splitlines()[0].split(actual_delimiter)
    elif file_ext in ("json", "jsonl"):
        with open(path, encoding="utf-8") as f:
            first_line = f.readline()
            second_line = f.readline()
        try:
            # NOTE(review): a one-line jsonl file fails here because the empty
            # second line is not valid JSON – confirm whether that is intended.
            json.loads(first_line)
            json.loads(second_line)
        except ValueError:
            raise InvalidFileError(
                f"File `{file_name}` is not a valid JSON-lines file! Each line must be a valid JSON object."
            )
        # Extract column_names
        column_names = list(json.loads(first_line).keys())
    else:
        raise InvalidFileError(
            f"AutoNLP does not support `.{file_ext}` files yet!")
    invalid_columns_source = set(col_mapping.keys()) - set(column_names)
    if invalid_columns_source:
        raise InvalidColMappingError(
            "Columns "
            + ",".join([f"'{col_name}'" for col_name in invalid_columns_source])
            + " could not be found in the provided file (which has columns: "
            + ",".join([f"'{col_name}'" for col_name in column_names]) + ")")
    invalid_columns_target = set(COLUMNS_PER_TASK[task]) - set(
        col_mapping.values())
    if invalid_columns_target:
        raise InvalidColMappingError(
            "\n".join(["Provided column mapping is:"] + [
                f"   '{src_col}' -> '{dst_col}'"
                for src_col, dst_col in col_mapping.items()
            ] + ["While expecting column mapping like:"] + [
                f"   'original_col_name' -> '{col_name}' (AutoNLP column name)"
                for col_name in COLUMNS_PER_TASK[task]
            ]))
def detect_csv_delimiter(str_array: str) -> str:
    """Return the delimiter character sniffed from the CSV sample *str_array*."""
    return Sniffer().sniff(str_array).delimiter