# Flask upload endpoint (the route decorator is not part of this snippet).
# isInt() and get_response() are helpers defined elsewhere in the module.
import flask
from flask import request, flash
import anycsv


def upload_file():
    if request.method == 'POST':
        # check if the POST request has the file part
        if 'csv' in request.files:
            # get the number of neighbours, default=10
            neighbours = request.args.get('neighbours', '10')
            if not isInt(neighbours):
                flash('Invalid number of neighbours. Use "neighbours={count}". Default is 10.')
                flask.abort(422, '\n'.join(flask.get_flashed_messages()))
            column = request.args.get('column', '0')
            if not isInt(column):
                flash('Invalid column index specified. Use the "column={index}" parameter. Default is 0.')
                flask.abort(422, '\n'.join(flask.get_flashed_messages()))
            file = request.files['csv']
            # if the user does not select a file, browsers may also
            # submit an empty part without a filename
            if file.filename == '':
                flash('No selected file')
                flask.abort(400, '\n'.join(flask.get_flashed_messages()))
            if file:
                # filename = secure_filename(file.filename)
                reader = anycsv.reader(content=file.read())
                values = [r[int(column)] for r in reader]
                return get_response(values=values, neighbours=int(neighbours))
        flash('Use the "csv" parameter to specify a file')
        return flask.abort(400, '\n'.join(flask.get_flashed_messages()))
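A minimal client-side sketch for the handler above. The host and route path are hypothetical (the route decorator is not shown); the 'csv' file part and the neighbours/column query parameters are the ones the handler actually reads.

import requests

with open('data.csv', 'rb') as f:
    resp = requests.post(
        'http://localhost:5000/upload',         # hypothetical host and route
        params={'neighbours': 5, 'column': 2},  # query args parsed by the handler
        files={'csv': f},                       # the 'csv' file part it checks for
    )
print(resp.status_code, resp.text)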
import sys
import gzip

import click
import anycsv
# Timer is the project's benchmarking helper, imported elsewhere in the module.


def parse_csv(csv, out, bench, bench_out):
    """Inspect a CSV file to figure out the dialect, comment and header
    lines, and the overall structure."""
    click.echo("Input CSV {}".format(csv))
    table = anycsv.reader(csv)
    if out:
        if out.endswith(".gz"):
            fout = gzip.open(out, 'wt', newline='')
        else:
            fout = open(out, "wt")
    else:
        fout = sys.stdout
    # imported locally because the 'csv' parameter shadows the module name
    import csv
    writer = csv.writer(fout)
    for row in table:
        writer.writerow(row)
    if out:
        fout.close()
    if bench:
        click.echo("TIMING")
        click.echo(Timer.printStats())
        if bench_out:
            Timer.to_csv(bench_out)
# 'log' and SimpleStructureDetector are defined elsewhere in the module.
def __init__(self, filename=None, url=None, content=None, sample_size=20,
             skip_guess_encoding=False,
             structure_detector=SimpleStructureDetector(),
             max_file_size=-1):
    self.table = anycsv.reader(filename=filename,
                               url=url,
                               content=content,
                               skip_guess_encoding=skip_guess_encoding,
                               sniff_lines=sample_size,
                               max_file_size=max_file_size)
    keys = [
        'encoding', 'url', 'filename', 'delimiter', 'quotechar',
        'lineterminator', 'skipinitialspace', 'quoting', 'doublequote'
    ]
    self.meta = {}
    for k, v in self.table.__dict__.items():
        if k in keys and v:
            self.meta[k] = v
    for k, v in self.table.dialect.items():
        if k in keys:
            self.meta[k] = v
    if 'url' in self.meta:
        self.meta['uri'] = self.meta.pop('url')
    # self.meta['dialect'] = self.table.dialect
    log.debug("Input file dialect",
              dialect=self.table.dialect,
              encoding=self.meta['encoding'])

    self.sample = []
    for i, row in enumerate(self.table):
        if i >= sample_size:
            break
        self.sample.append(row)

    # empty lines are included in the sample for now
    self.descriptionLines = structure_detector.guess_description_lines(self.sample)
    if self.descriptionLines is None:
        raise ValueError(
            "structure_detector must return a value; if no description "
            "lines exist, return an empty result")
    # allow multiple header lines if they exist; if none exist, return empty
    self.header_lines = structure_detector.guess_headers(self.sample)
    if self.header_lines is None:
        raise ValueError(
            "structure_detector must return a value; if no header "
            "lines exist, return an empty result")
    self.columns = structure_detector.guess_columns(self.sample)
    self.table.seek_line(0)  # len(self.descriptionLines) + len(self.header_lines))
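The constructor above treats a None return from the detector as an error, so a custom detector has to return empty results explicitly. A minimal sketch of that interface, assuming guess_columns reports a column count (its return shape is not pinned down by the snippet):

class EmptyStructureDetector:
    """Illustrative sketch (not part of anycsv): the minimal contract the
    constructor above expects. Each guess_* method must return a value;
    an empty result is an empty list, never None."""

    def guess_description_lines(self, sample):
        return []  # no leading description/comment lines detected

    def guess_headers(self, sample):
        return []  # no header lines detected

    def guess_columns(self, sample):
        # assumed here to be the column count of the widest sampled row
        return max((len(row) for row in sample), default=0)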
def test_single_file():
    csv = "/Users/jumbrich/data/mimesis_csvs/encoding/latin.csv"
    reader = anycsv.reader(csv)
    for row in reader:
        assert len(row) == 9
    assert reader.digest is not None
def test_file(tmpdir):
    p = tmpdir.mkdir("tmp.csvs").mkdir("data")
    # _create_table is a test helper defined elsewhere in the test module
    csv = _create_table(p, rows=200, columns=4, gzipped=True)
    reader = anycsv.reader(csv)
    for row in reader:
        assert len(row) == 4
    assert reader.digest is not None
def show_similar_columns(similar_columns):
    print("Found " + str(len(similar_columns)) + " groups of similar columns:")
    for idx, cluster in enumerate(similar_columns):
        # write each cluster of similar columns into a separate csv file
        cluster_series = []  # renamed; the original reused the argument name
        print(str(len(cluster)) + " similar columns:")
        print(cluster)
        for column_label in cluster:
            print("Column: " + column_label)
            # labels are built as "<file>_<col_id>", so split on the last
            # underscore in case the file name itself contains one
            file, col_id = column_label.rsplit('_', 1)
            fp = os.path.join(DATA_PATH, file)
            try:
                csvr = anycsv.reader(filename=fp)
                # skip the first line to get past description/header content
                h = next(csvr)
                # set up one list per column
                columns = [[] for _ in range(len(h))]
                for row in csvr:
                    for i, cell in enumerate(row):
                        columns[i].append(cell)
                column = columns[int(col_id)]
                print(column[:5])
                cluster_series.append(pd.Series(column))
            except Exception as e:
                print(e)
        df = pd.concat(cluster_series, axis=1).reset_index()
        df.to_csv(RESULTS_PATH + 'cluster' + str(idx) + '.csv')
def find_similar_hashed_columns(hashbits=8, limit=None):
    '''Find similar columns using simhash.'''
    hashes = {}
    hash_func = SimHash(hashbits)
    textual_columns_iterator = get_textual_columns(limit)
    for i, column_description in textual_columns_iterator:
        file = column_description['file']
        col_id = column_description['column']
        label = file + '_' + str(col_id)
        fp = os.path.join(DATA_PATH, file)
        try:
            csvr = anycsv.reader(filename=fp)
            # skip the first 3 lines to avoid description and header lines
            h = next(csvr)
            h = next(csvr)
            h = next(csvr)
            while len(h) <= 1:
                # possibly a description line
                h = next(csvr)
            # set up one list per column
            columns = [[] for _ in range(len(h))]
            for row in csvr:
                for j, cell in enumerate(row):  # j, not i: the outer loop uses i
                    columns[j].append(cell)
            column = columns[col_id]
            hashed_column = hash_column(column, hash_func).hex()
            if hashed_column not in hashes:
                hashes[hashed_column] = []
            hashes[hashed_column].append(label)
        except Exception as e:
            print(e)
    # columns whose simhash collides end up in the same bucket
    similar_columns = [bucket for bucket in hashes.values() if len(bucket) > 1]
    return similar_columns
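A hedged usage sketch tying the two helpers above together; the hash width and column limit are arbitrary example values, not defaults from the source.

similar = find_similar_hashed_columns(hashbits=8, limit=100)
show_similar_columns(similar)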
def inspect_csv(csv, bench, bench_out):
    """Inspect a CSV file to figure out the dialect, comment and header
    lines, and the overall structure."""
    click.echo("Input CSV {}".format(csv))
    reader = anycsv.reader(csv)
    # consume the whole file so digest and dialect are fully populated
    for i, row in enumerate(reader):
        pass
    click.echo("{:-^80}".format(" Table Info "))
    click.echo(" input: {}".format(reader.csv))
    click.echo(" encoding: {}".format(reader.encoding))
    click.echo(" md5: {}".format(reader.digest))
    click.echo(" dialect:")
    for k, v in reader.dialect._asdict().items():
        click.echo(" {}: {}".format(k, v))
    if bench:
        click.echo("TIMING")
        click.echo(Timer.printStats())
        if bench_out:
            Timer.to_csv(bench_out)
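Both parse_csv and inspect_csv read like click commands, but their decorators are not part of the snippets above. A sketch of plausible wiring; the command and option names are assumptions, not the project's actual CLI.

import click

@click.command()                 # hypothetical wiring, not from the source
@click.argument('csv')
@click.option('--bench', is_flag=True, help='print timing statistics')
@click.option('--bench-out', default=None, help='write timings to this CSV file')
def inspect(csv, bench, bench_out):
    # delegates to the inspect_csv function defined above
    inspect_csv(csv, bench, bench_out)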
def __init__(self, filename=None, url=None, content=None, sample_size=20,
             skip_guess_encoding=False):
    self.table = anycsv.reader(filename=filename,
                               url=url,
                               content=content,
                               skip_guess_encoding=skip_guess_encoding)
    # copy dialect information to the returning result object
    keys = ["encoding", "url", "filename", "delimiter", "quotechar", "columns"]
    for k, v in self.table.__dict__.items():
        if k in keys and v:
            setattr(self, k, v)

    self.sample = []
    for i, row in enumerate(self.table):
        # if len(row) != self.columns:
        #     raise ValueError("Row length of " + str(len(row)) +
        #                      " does not match column length of " + str(self.columns))
        if i >= sample_size:
            break
        self.sample.append(row)

    self.description = guess_description_lines(self.sample)
    # keep the detected description lines and trim them from the sample once
    self.descriptionLines = self.sample[:self.description]
    self.sample = self.sample[self.description:]
    self.emptyColumns = detect_empty_columns(self.sample)
    self.columns = len(self.sample[0])

    self.header_line = guess_headers(self.sample, self.emptyColumns)
    if self.header_line:
        self.table.seek_line(self.description + 1)
    else:
        self.table.seek_line(self.description)
        # end of a method on the Column class used below: pick the key with
        # the highest count from the tokens dict
        self.length = max(tokens.items(), key=operator.itemgetter(1))[0]


if __name__ == '__main__':
    col_stats = {'labels': defaultdict(int),
                 'lengths': defaultdict(int),
                 'columns': 0,
                 'tables': 0,
                 'errors': 0}
    classification = [['file', 'column', 'type', 'avg_tokens']]
    for root, subdirs, files in os.walk('tables'):
        for filename in files:
            if filename.endswith('.csv'):
                try:
                    path = os.path.join(root, filename)
                    csvr = anycsv.reader(filename=path)
                    # skip the first 3 lines to avoid description and header lines
                    h = next(csvr)
                    h = next(csvr)
                    h = next(csvr)
                    while len(h) <= 1:
                        # possibly a description line
                        h = next(csvr)
                    # set up one list per column
                    columns = [[] for _ in range(len(h))]
                    for row in csvr:
                        for i, cell in enumerate(row):
                            columns[i].append(cell)
                    for col_id, c in enumerate(columns):
                        col = Column(c)
                        col_stats['labels'][col.label] += 1
import anycsv
import csv

# reader = anycsv.reader(filename="data.csv")
reader = anycsv.reader(
    url="https://dev.inpher.io/datasets/correlation/test1/bank-full-X.csv")

with open('testfile.csv', 'w', newline='') as f:
    writer = csv.writer(f, delimiter='|')
    writer.writerows(reader)
import anycsv
import csv

# filename = 'path/to/file.csv'
# reader = anycsv.reader(filename=filename)
# url = 'http://file.csv'
# reader = anycsv.reader(url=url)
content = 'a,b,c\n1,2,3\n4,5,6'

reader = anycsv.reader(
    url="https://files.datapress.com/calderdale/dataset/domestic-consumption-monitor---monthly-meter-readings/2016-08-31T11:56:15/Domestic"
)

with open('testfile.csv', 'w', newline='') as f:
    writer = csv.writer(f, delimiter='|')
    writer.writerows(reader)
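The content string above is defined but never used; anycsv.reader also accepts raw CSV text through its content parameter (the same keyword the upload handler and the constructors in this section pass), which avoids touching disk or network:

reader = anycsv.reader(content=content)
for row in reader:
    print(row)  # each row comes back as a list of cell values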
import anycsv
import csv

reader = anycsv.reader(filename="data.csv")
# reader = anycsv.reader(url="https://dev.inpher.io/datasets/correlation/test1/bank-full-X.csv")

with open('result.csv', 'w', newline='') as f:
    writer = csv.writer(f, delimiter='\t')
    writer.writerows(reader)
col_stats = {
    'labels': defaultdict(int),
    'lengths': defaultdict(int),
    'columns': 0,
    'tables': 0,
    'errors': 0
}
classification = [['file', 'column', 'type', 'avg_tokens']]
for root, subdirs, files in os.walk(PATH):
    for filename in files:
        if filename.endswith('.csv'):
            try:
                path = os.path.join(root, filename)
                csvr = anycsv.reader(filename=path)
                # skip the first 3 lines to avoid description and header lines
                h = next(csvr)
                h = next(csvr)
                h = next(csvr)
                while len(h) <= 1:
                    # possibly a description line
                    h = next(csvr)
                # set up one list per column
                columns = [[] for _ in range(len(h))]
                for row in csvr:
                    for i, cell in enumerate(row):
                        columns[i].append(cell)
                for col_id, c in enumerate(columns):
                    col = Column(c)
                    col_stats['labels'][col.label] += 1
def from_table(self, filename=None, url=None, content=None,
               min_matches=0.6, sample_size=300):
    if not filename and not url and not content:
        return None
    sample = []
    cols = []
    col_types = []
    num_cols = 0
    i = 0
    try:
        csvr = anycsv.reader(filename=filename, url=url, content=content)
    except NoDelimiterException:
        csvr = anycsv.reader(filename=filename, url=url, content=content,
                             delimiter=',')
    for i, row in enumerate(csvr):
        if i <= sample_size:
            sample.append(row)
            num_cols = len(row)
            for k, c in enumerate(row):
                if len(cols) == 0:
                    cols = [[] for _ in range(num_cols)]
                    col_types = [defaultdict(int) for _ in range(num_cols)]
                if NUTS_PATTERN.match(c):
                    col_types[k]['NUTS'] += 1
                elif POSTAL_PATTERN.match(c):
                    col_types[k]['POSTAL'] += 1
                cols[k].append(c.strip())

    result = ['' for _ in range(num_cols)]
    disambiguation = [['' for _ in range(sample_size)]
                      for _ in range(num_cols)]
    for col in range(num_cols):
        # based on col type (90% threshold)
        if 'NUTS' in col_types[col] and col_types[col]['NUTS'] >= i * 0.9:
            disamb, confidence, res_col, source = self.nuts_column(cols[col])
        elif 'POSTAL' in col_types[col] and col_types[col]['POSTAL'] >= i * 0.9:
            disamb, confidence, res_col = self.postalcodes_column(cols[col])
        else:
            disamb, confidence, res_col, source = self.string_column(cols[col])
        if confidence > min_matches:
            disambiguation[col] = disamb
            result[col] = res_col
    return {
        'disambiguation': disambiguation,
        'sample': sample,
        'cols': num_cols,
        'rows': i,
        'tagging': result
    }