def guess_dialect(self, sample):
    """Guess the CSV dialect of *sample* and whether it starts with a header.

    ``csv.Sniffer`` is tried first.  When it cannot determine a delimiter
    (typically because of an empty column, e.g.
    ``89.187.1.81,06-05-2016,,CZ,botnet drone``) a hand-made guess is used.

    Args:
        sample: Text taken from the beginning of the input.

    Returns:
        A ``(dialect, has_header)`` tuple.  ``dialect`` is a ``csv.Dialect``
        subclass; ``has_header`` is always ``False`` when the fallback guess
        was needed.
    """
    sniffer = Sniffer()
    try:
        dialect = sniffer.sniff(sample)
        has_header = sniffer.has_header(sample)
    except Error:
        # The sniffer gave up, so the header flag cannot be detected either.
        has_header = False

        # Inspect the second line: a header has no empty column for sure.
        # Fall back to the first line when the sample has only one.
        lines = sample.split("\n")
        probe = lines[1] if len(lines) > 1 else lines[0]

        candidates = (",", ";", "|")
        # A doubled sign (an empty column) is the strongest hint ...
        delimiter = next((c for c in candidates if c + c in probe), "")
        if not delimiter:
            # ... otherwise take anything that resembles a delimiter.
            delimiter = next((c for c in candidates if c in probe), "")

        # Build a private subclass: assigning attributes on csv.unix_dialect
        # itself would silently change the stdlib dialect for the whole
        # process (and for every later call of this method).
        class dialect(csv.unix_dialect):
            escapechar = csv.unix_dialect.escapechar or "\\"
            doublequote = True

        if delimiter:
            # Without any candidate the inherited "," is kept; an empty
            # delimiter would be rejected by csv.reader/csv.writer anyway.
            dialect.delimiter = delimiter
    return dialect, has_header
def guess_dialect(self, sample):
    """Guess the CSV dialect of *sample* and whether it starts with a header.

    ``csv.Sniffer`` is tried first.  When it cannot determine a delimiter
    (typically because of an empty column, e.g.
    ``89.187.1.81,06-05-2016,,CZ,botnet drone``) a hand-made guess is used.

    Args:
        sample: Text taken from the beginning of the input.

    Returns:
        A ``(dialect, has_header)`` tuple.  ``dialect`` is a ``csv.Dialect``
        subclass; ``has_header`` is always ``False`` when the fallback guess
        was needed.

    Raises:
        SystemExit: If the sniffer fails and the sample is blank; the
            message "The file seems empty" is printed first.
    """
    sniffer = Sniffer()
    try:
        dialect = sniffer.sniff(sample)
        has_header = sniffer.has_header(sample)
    except Error:
        if sample.strip() == "":
            print("The file seems empty")
            # Same effect as the interactive-only quit() helper, but it does
            # not depend on the site module being loaded.
            raise SystemExit

        # The sniffer gave up, so the header flag cannot be detected either.
        has_header = False

        # Inspect the second line: a header has no empty column for sure.
        # Fall back to the first line when the sample has only one.
        lines = sample.split("\n")
        probe = lines[1] if len(lines) > 1 else lines[0]

        candidates = (",", ";", "|")
        # A doubled sign (an empty column) is the strongest hint ...
        delimiter = next((c for c in candidates if c + c in probe), "")
        if not delimiter:
            # ... otherwise take anything that resembles a delimiter.
            delimiter = next((c for c in candidates if c in probe), "")

        # Build a private subclass: assigning attributes on csv.unix_dialect
        # itself would silently change the stdlib dialect for the whole
        # process (and for every later call of this method).
        class dialect(csv.unix_dialect):
            escapechar = csv.unix_dialect.escapechar or "\\"
            doublequote = True

        if delimiter:
            # Without any candidate the inherited "," is kept; an empty
            # delimiter would be rejected by csv.reader/csv.writer anyway.
            dialect.delimiter = delimiter
    return dialect, has_header
def __init__(self, inFile):
    """Open the CSV file *inFile* and prepare a reader over its rows.

    The dialect and the presence of a header row are guessed from the first
    1024 characters of the file.

    Args:
        inFile: Path of the CSV file to read.

    Attributes set:
        reader: ``csv.reader`` positioned on the first data row.
        varNames: List of header cells, or ``None`` when no header row was
            detected.
        _csvFile: The open file object backing ``reader``.

    Raises:
        csv.Error: If the dialect cannot be determined from the sample.
    """
    from csv import Sniffer, reader

    # newline='' is what the csv module asks for, so that line endings
    # embedded in quoted fields reach the parser untouched.
    csvFile = open(inFile, 'r', newline='')
    try:
        sample = csvFile.read(1024)
        csvFile.seek(0)
        # sniff() and has_header() are instance methods; calling them on the
        # Sniffer class itself raises TypeError.
        sniffer = Sniffer()
        self.reader = reader(csvFile, sniffer.sniff(sample))
        has_header = sniffer.has_header(sample)
    except Exception:
        # Do not leak the handle when sniffing fails.
        csvFile.close()
        raise
    # The reader pulls rows lazily, so the file has to stay open; keep a
    # reference so that it can be closed once reading is done.
    self._csvFile = csvFile
    if has_header:
        self.varNames = next(self.reader)
    else:
        self.varNames = None
def getDelimiter(path):
    """Guess the delimiter and header presence of the CSV file at *path*.

    Only the first three lines of the file are inspected.

    Args:
        path: Path of the CSV file.

    Returns:
        A ``(delimiter, has_header)`` tuple.  ``delimiter`` is one of
        ``";"``, ``","`` or ``" "`` and defaults to ``","`` when sniffing
        fails; ``has_header`` is ``False`` when it cannot be determined.
    """
    from csv import Error as CsvError

    sniffer = Sniffer()
    with open(path, 'r') as rfile:
        sample = rfile.readline() + rfile.readline() + rfile.readline()

    try:
        delimiter = sniffer.sniff(sample, delimiters=";, ").delimiter
    except Exception:
        # Deliberate best effort: any sniffing problem means "plain commas".
        delimiter = ","

    try:
        # has_header() sniffs the sample again on its own (without the
        # delimiter restriction) and raises csv.Error when that fails,
        # e.g. for an empty file.
        has_header = sniffer.has_header(sample)
    except CsvError:
        has_header = False
    return delimiter, has_header
def p_csv(dialect: Optional[str], padding: bool) -> int:
    """Read CSV text from stdin and write the processed rows to stdout.

    Input in which no header row is detected is echoed back unchanged.
    Otherwise the rows produced by ``_read`` are written with ``dialect``,
    or with the sniffed dialect when ``dialect`` is falsy.

    Args:
        dialect: Name of the CSV dialect to use, or ``None`` to sniff it.
        padding: Forwarded to ``_read``.

    Returns:
        ``0`` on success, ``1`` when a ``CSVErr`` was raised and logged.
    """
    data = stdin.read()
    sniffer = Sniffer()
    try:
        # has_header() sniffs the data internally, so it belongs inside the
        # try block together with the other CSV operations.
        # NOTE(review): presumably CSVErr is csv.Error - confirm the import.
        if not sniffer.has_header(data):
            print(data, end="")
            return 0
        d = dialect or sniffer.sniff(data)
        r = _read(data, dialect=d, padding=padding)
        w = writer(stdout, dialect=d)
        w.writerows(r)
    except CSVErr as e:
        log.critical("%s", f"{ERROR}{linesep}{e}")
        return 1
    return 0
def read(filename):
    """Read a ``;``- or ``,``-separated CSV file into a list of row dicts.

    The dialect is sniffed from the first 1024 characters of the file and
    the first row supplies the keys.

    Args:
        filename: Path of the CSV file.

    Returns:
        A list with one dict per data row, keyed by the header cells.

    Raises:
        csv.Error: If the delimiter cannot be determined from the sample.
    """
    # newline='' is required by the csv module; with the default translation
    # a CRLF inside a quoted field would be rewritten to LF.
    with open(filename, "r", newline="") as csvfile:
        sample = csvfile.read(1024)
        dialect = Sniffer().sniff(sample, delimiters=[';', ','])
        csvfile.seek(0)
        # DictReader always consumes the first row as field names, so no
        # separate header detection is needed.
        return list(DictReader(csvfile, dialect=dialect))
def read(filename):
    """Read a ``;``- or ``,``-separated CSV file into row dicts plus header.

    The dialect is sniffed from the first 4096 characters of the file and
    the first row supplies the keys.

    Args:
        filename: Path of the CSV file.

    Returns:
        A ``(lines, fieldnames)`` tuple: a list with one dict per data row
        and the list of header cells (``None`` for an empty file).

    Raises:
        csv.Error: If the delimiter cannot be determined from the sample.
    """
    # newline='' is required by the csv module; with the default translation
    # a CRLF inside a quoted field would be rewritten to LF.
    with open(filename, "r", newline="") as csvfile:
        sample = csvfile.read(4096)
        dialect = Sniffer().sniff(sample, delimiters=[';', ','])
        csvfile.seek(0)
        # DictReader always consumes the first row as field names, so no
        # separate header detection is needed.
        lines_reader = DictReader(csvfile, dialect=dialect)
        lines = list(lines_reader)
        return lines, lines_reader.fieldnames
def _first_line_looks_like_header(lines):
    """Tell whether the first of the stripped *lines* differs enough from the rest to be a header."""
    header, rows = lines[0], lines[1:]
    if not rows:
        return False
    header_to_rows_similarity = mean(
        [SequenceMatcher(None, header, row).ratio() for row in rows])
    if len(rows) > 1:
        rows_similarity = mean([
            SequenceMatcher(None, *pair).ratio()
            for pair in itertools.combinations(rows, 2)
        ])
        # The rows resemble each other more than they resemble the first
        # line -> the first line is a header.
        return rows_similarity > header_to_rows_similarity + 0.1
    return header_to_rows_similarity < 0.5


def _guess_delimiter_in(line):
    """Return the first of ``,`` ``;`` ``|`` found in *line* (doubled occurrences win), or ``""``."""
    candidates = (",", ";", "|")
    for candidate in candidates:
        # A doubled sign means an empty column, which is a strong hint.
        if candidate + candidate in line:
            return candidate
    for candidate in candidates:
        if candidate in line:
            return candidate
    return ""


def guess_dialect(sample):
    """Guess the CSV dialect of *sample* and whether it starts with a header.

    ``csv.Sniffer`` is tried first.  When it fails, or when it proposes a
    letter as the delimiter, the header is detected by line similarity and
    the delimiter is searched among ``,`` ``;`` ``|``.

    Args:
        sample: List of the first lines of the input.

    Returns:
        A ``(dialect, has_header, seems_single)`` tuple.  ``dialect`` is a
        ``csv.Dialect`` subclass; ``seems_single`` is ``True`` when the input
        is one line that looks like a single value rather than a row.

    Raises:
        SystemExit: If no dialect could be sniffed and the sample is blank;
            the message "The file seems empty" is printed first.
    """
    sniffer = Sniffer()
    sample_text = "".join(sample)
    try:
        dialect = sniffer.sniff(sample_text)
        has_header = sniffer.has_header(sample_text)
        if re.match("[a-z]", dialect.delimiter.lower()):
            # Letters are not accepted as delimiters, it seems like nonsense.
            raise Error
    except Error:
        # Sniffing failed - maybe there is an empty column:
        # "89.187.1.81,06-05-2016,,CZ,botnet drone"
        if sample_text.strip() == "":
            print("The file seems empty")
            # Same effect as the interactive-only quit() helper, but it does
            # not depend on the site module being loaded.
            raise SystemExit

        has_header = _first_line_looks_like_header(
            [line.strip() for line in sample])

        # Skip the header when possible (it has no empty column for sure).
        data_line = sample[1] if len(sample) > 1 else sample[0]
        delimiter = _guess_delimiter_in(data_line)

        # Build a private subclass: assigning attributes on csv.unix_dialect
        # itself would change the stdlib dialect for the whole process, and a
        # delimiter guessed for one input would leak into later calls.
        class dialect(csv.unix_dialect):
            escapechar = csv.unix_dialect.escapechar or "\\"
            doublequote = True

        if delimiter:
            dialect.delimiter = delimiter

    seems_single = False
    if len(sample) == 1:
        # A single line in the sample means a single line in the input, so
        # it is definitely not a header.
        has_header = False
        if dialect.delimiter not in [".", ",", "\t"] and "|" not in sample_text:
            # Use case: a short one-liner like "convey hello" would produce a
            # meaningless delimiter such as "l".
            # XX should be None maybe, let's think of the whole row as a
            # single column - but then we could not add columns.
            dialect.delimiter = "|"
            seems_single = True
    # NOTE(review): the nesting of this check is ambiguous in the collapsed
    # source; it is treated as function-level here so that it also covers
    # multi-line input - confirm against the original file.
    if dialect.delimiter == "." and "," not in sample_text:
        # Prefer the common use case (a bare list of IP addresses) over the
        # strange use case of "." delimiting.
        dialect.delimiter = ","
    return dialect, has_header, seems_single