def test_column_stats(self):
    """Column-stats regression labels on a small numeric CSV.

    c1 counts 1..6 exactly (linear slope 1.0), c2 only ever grows
    (monotonic, with one empty cell), and c3 is not monotonic, so no
    regression entry is produced for it.
    """
    rows = ("c1,c2,c3,c4\n"
            "1,0.1,1,\n"
            "2,5.1,2,\n"
            "3,5.2,1,\n"
            "4,5.6,3,\n"
            "5,,4,\n"
            "6,19,6,")
    parsed_tables = tablemagician.from_file_object(StringIO.StringIO(rows))
    self.analyser_table = parsed_tables[0].process()
    parsed_tables[0].close()

    # run the complex-type analyser first, then the column stats on top
    engine = AnalyserEngine([ComplexTypeAnalyser(), ColumnStatsAnalyser()])
    engine.process(self.analyser_table)

    stats = self.analyser_table.analysers[ColumnStatsAnalyser.name]
    self.assertEqual(stats[0]['regression'], 'INCREASE/LINEAR/1.0')
    self.assertEqual(stats[1]['regression'], 'INCREASE/MONOTONIC')
    self.assertTrue('regression' not in stats[2])
def feature_extraction(filename):
    """Extract simple table features from a (possibly gzipped) CSV file.

    Reads the first rows of the first table found in the file and returns
    ``[number_of_lines, number_of_columns, numeric_columns, string_columns]``.

    Raises ``ValueError`` if no table can be parsed from the file.

    Fixes over the previous version: the file handle is now closed even when
    table parsing raises, the duplicated gzip-vs-plain open logic is shared,
    and an unused ``header`` local was removed.
    """
    def _open():
        # .gz files are transparently decompressed; everything else is
        # read as a plain binary file
        if filename.endswith('.gz'):
            return gzip.open(filename, 'rb')
        return open(filename, 'rb')

    f = _open()
    try:
        # read first rows of table
        tables = tablemagician.from_file_object(f, filename)
        if len(tables) == 0:
            raise ValueError('No table: ' + filename)
        # only the first table contributes features (original broke after one)
        table = tables[0]
        num_of_columns = len(table.headers)
        types = [t.result_type for t in table.types]
        numeric = types.count(int) + types.count(decimal.Decimal)
        strings = types.count(basestring)
        feature = [num_of_columns, numeric, strings]
    finally:
        f.close()

    # reopen: the first pass consumed the stream, so count lines on a
    # fresh handle
    f = _open()
    try:
        number_lines = sum(1 for _ in f)
    finally:
        f.close()

    return [number_lines] + feature
def write_analyser_table_to_file(f, filename, out_dir, analyser_engine):
    """Analyse every table parsed from ``f`` and pickle the result.

    Each analysed table is written to ``out_dir/<filename>.pkl``; the
    analyser engine is fed before serialisation.
    """
    for table in tablemagician.from_file_object(f, filename):
        processed = table.process(max_lines=MAX_LINES)
        # run the analyser chain over the parsed table
        analyser_engine.process(processed)
        # persist the analysed table next to its siblings in out_dir
        target = os.path.join(out_dir, filename + '.pkl')
        with open(target, 'wb') as sink:
            pickle.dump(processed, sink)
        table.close()
def filter_out_files(rootdir, outdir): for subdir, dirs, files in os.walk(rootdir): for file in files: try: filename = os.path.join(rootdir, file) # unzip files if filename.endswith('.gz'): f = gzip.open(filename, 'rb') # count all lines number_lines = sum(1 for _ in f) f.close() # open again after iteration f = gzip.open(filename, 'rb') # read first rows of table tables = tablemagician.from_file_object(f, filename) for table in tables: #analyser_table = table.process(max_lines=MAX_ROWS) # columns num_of_columns = len(table.headers) # types types = [t.result_type for t in table.types] # filter criteria, bigger sized tables if number_lines > 50 and num_of_columns > 6: if len(types) == types.count(basestring): # pure string tables str_dir = os.path.join(outdir, 'all_string') if not os.path.exists(str_dir): os.mkdir(str_dir) shutil.copyfile(filename, os.path.join(str_dir, file)) elif basestring in types and (types.count(int) + types.count(decimal.Decimal)) >= len(types)/2: # containing a string column and multiple numerical columns num_dir = os.path.join(outdir, 'some_numeric') if not os.path.exists(num_dir): os.mkdir(num_dir) shutil.copyfile(filename, os.path.join(num_dir, file)) table.close() except Exception as e: traceback.print_exc() print e
def test_type_detection(self): # build analyser table data = "c1,c2,c3,c4\n" \ "12cm,3%,€300.50,\n" \ "1 cm,1%,€ 12.345,\n" \ "1.5 cm,0.5%,€ 130,34.2\n" \ "1 cm,1%,€ 12.345,\n" \ "1.5 cm,0.5%,€ 130,34.2\n" \ "1.5 cm,0.5%,€ 130.1000," data_tables = tablemagician.from_file_object(StringIO.StringIO(data)) self.analyser_table = data_tables[0].process() data_tables[0].close() a = ComplexTypeAnalyser() b = ColumnStatsAnalyser() analyser_chain = [a, b] # build engine engine = AnalyserEngine(analyser_chain) # feed with analyser table engine.process(self.analyser_table) columns = self.analyser_table.analysers[ComplexTypeAnalyser.name] for t in columns[0]: self.assertTrue(t.startswith('NUMALPHA')) self.assertTrue(columns[0]['NUMALPHA/NUMBER/INT:1+-ALPHA:2'] == 2) for t in columns[1]: self.assertTrue(t.startswith('NUMALPHA')) self.assertTrue(columns[1]['NUMALPHA/NUMBER/FLOAT:1.1-ALPHA+:1'] == 3) for t in columns[2]: self.assertTrue(t.startswith('ALPHANUM')) self.assertTrue(columns[2]['ALPHANUM/ALPHA+:1-NUMBER/FLOAT:2.3'] == 2) self.assertTrue(columns[2]['ALPHANUM/ALPHA+:1-NUMBER/FLOAT:3.*'] == 1) self.assertTrue(columns[3]['EMPTY'] == 4) for stats in self.analyser_table.analysers[ColumnStatsAnalyser.name]: print 'ColStats:', stats
def build_by_header(header_set, rootdir, dest_dir):
    """Sort files into per-header training folders.

    Walks ``rootdir``; any file whose table headers (lower-cased, stripped)
    intersect ``header_set`` is copied into ``dest_dir/<matched header>``.
    Errors are logged per file and the walk continues.

    Fixes over the previous version: filenames are joined against ``subdir``
    (the directory ``os.walk`` is visiting) rather than ``rootdir``, so
    nested files open correctly, and the file handle is closed in a
    ``finally`` block so it no longer leaks when parsing raises.
    """
    wanted = set(header_set)
    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            try:
                filename = os.path.join(subdir, file)
                # unzip files
                if filename.endswith('.gz'):
                    f = gzip.open(filename, 'rb')
                else:
                    f = open(filename, 'rb')
                try:
                    for dt in tablemagician.from_file_object(f, file):
                        intersec = wanted & set(h.lower().strip() for h in dt.headers)
                        if len(intersec) > 0:
                            # one folder per matched header name
                            train_dir = os.path.join(dest_dir, intersec.pop())
                            if not os.path.exists(train_dir):
                                os.mkdir(train_dir)
                            shutil.copyfile(filename, os.path.join(train_dir, file))
                finally:
                    f.close()
            except Exception as e:
                # best-effort batch job: log and move on
                print(file + ' - ' + str(e))
                print(traceback.format_exc())
def read_files(path, d):
    """Collect training columns named ``d`` from all files in ``path/d``.

    For every file in the directory ``path/d`` (gzipped or plain), parses
    its tables and appends each column whose header matches ``d``
    (case-insensitive, stripped) as ``{'values': ..., 'name': filename}``.
    The collected set is pickled to ``path/d.pkl``.

    Fixes over the previous version: the file handle is closed in a
    ``finally`` block (previously it leaked whenever parsing raised), and
    the local no longer shadows the ``file`` builtin.
    """
    train_set = []
    train_dir = join(path, d)
    onlyfiles = [join(train_dir, name) for name in listdir(train_dir)
                 if isfile(join(train_dir, name))]
    for filename in onlyfiles:
        try:
            # unzip files
            if filename.endswith('.gz'):
                f = gzip.open(filename, 'rb')
            else:
                f = open(filename, 'rb')
            try:
                for dt in tablemagician.from_file_object(f, filename):
                    analyser_table = dt.process(max_lines=MAX_LINES)
                    for i, h in enumerate(analyser_table.headers):
                        if d == h.lower().strip():
                            train_set.append({'values': analyser_table.columns[i],
                                              'name': filename})
            finally:
                f.close()
        except Exception as e:
            # log and continue with the remaining files
            print(filename + ' - ' + str(e))
            print(traceback.format_exc())

    with open(join(path, d + '.pkl'), 'wb') as handle:
        pickle.dump(train_set, handle)
def main():
    # Batch statistics job: walks the input directory, gathers per-file
    # row/column/header/type frequency counts, and dumps them as CSVs
    # (rows.csv, columns.csv, header.csv, types.csv, column_types.csv)
    # in the current working directory.
    args = arg_parser()
    rootdir = args.in_dir
    outdir = args.out_dir
    if args.filter:
        # optional pre-pass that copies interesting tables into outdir
        filter_out_files(rootdir, outdir)
    # frequency counters: value -> number of files/columns with that value
    num_of_rows = defaultdict(int)
    num_of_columns = defaultdict(int)
    header = defaultdict(int)
    types = defaultdict(int)
    column_types = defaultdict(int)
    start = time.time()
    row_counting_time = 0
    i = 0
    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            try:
                # NOTE(review): joins against rootdir, not subdir — files in
                # nested directories will fail to open; confirm rootdir is flat
                filename = os.path.join(rootdir, file)
                # unzip files; non-.gz files are skipped entirely
                if filename.endswith('.gz'):
                    f = gzip.open(filename, 'rb')
                    # count all lines (timed separately for the summary)
                    start_row_counting = time.time()
                    number_lines = sum(1 for _ in f)
                    end_row_counting = time.time()
                    row_counting_time += end_row_counting - start_row_counting
                    num_of_rows[number_lines] += 1
                    f.close()
                    # open again after iteration (the stream was consumed)
                    f = gzip.open(filename, 'rb')
                    # read first rows of table
                    tables = tablemagician.from_file_object(f, filename)
                    for table in tables:
                        #analyser_table = table.process(max_lines=MAX_ROWS)
                        # columns
                        num_of_columns[len(table.headers)] += 1
                        # header occurrence counts (case-insensitive)
                        for h in table.headers:
                            header[h.lower().strip()] += 1
                        # classify the table by its column types; also
                        # updates the per-column-type counter in place
                        table_type = type_classification([t.result_type for t in table.types], column_types)
                        types[table_type] += 1
                        table.close()
                    i += 1
            except Exception as e:
                # best-effort: log the failure and keep walking
                traceback.print_exc()
                print e
            # progress heartbeat every 100 processed files
            if i % 100 == 0:
                print i, 'files processed'
    # dump counters, most frequent first
    with open('rows.csv', 'wb') as f:
        f.write('rows,count\n')
        for r in sorted(num_of_rows, key=num_of_rows.get, reverse=True):
            f.write(str(r) + ',' + str(num_of_rows[r]) + '\n')
    with open('columns.csv', 'wb') as f:
        f.write('columns,count\n')
        for r in sorted(num_of_columns, key=num_of_columns.get, reverse=True):
            f.write(str(r) + ',' + str(num_of_columns[r]) + '\n')
    # headers may contain non-ASCII text, so write this one via io.open
    with io.open('header.csv', 'wt') as f:
        f.write(u'header,count\n')
        for r in sorted(header, key=header.get, reverse=True):
            try:
                f.write(unicode(r) + ',' + unicode(header[r]) + u'\n')
            except Exception as e:
                # skip headers that cannot be encoded
                print e
    with open('types.csv', 'wb') as f:
        f.write('type,count\n')
        for r in sorted(types, key=types.get, reverse=True):
            f.write(str(r) + ',' + str(types[r]) +
                    '\n')
    with open('column_types.csv', 'wb') as f:
        f.write('column_type,count\n')
        for r in sorted(column_types, key=column_types.get, reverse=True):
            f.write(str(r) + ',' + str(column_types[r]) + '\n')
    end = time.time()
    print 'total time:', end - start
    print 'row counting time:', row_counting_time
if __name__ == '__main__': path = sys.argv[1] headers = sys.argv[2:] train_sets = {} for h in headers: with open(join(path, h + '.pkl'), 'rb') as handle: train_sets[h] = pickle.load(handle) data = build_data_frame(train_sets) data = data.reindex(numpy.random.permutation(data.index)) pipeline = Pipeline([ ('count_vectorizer', CountVectorizer(ngram_range=(1, 2))), ('classifier', MultinomialNB()) ]) pipeline.fit(data['text'].values, data['class'].values) # build example data # country ex = 'https%3A%2F%2Fcommondatastorage.googleapis.com%2Fckannet-storage%2F2012-04-17T212635%2Fcommitments-export-final.csv.gz' with gzip.open(join('/home/sebastian/csv_stats/bandersnatch', ex)) as handle: dts = tablemagician.from_file_object(handle) for dt in dts: at = dt.process(max_lines=100) col_str = ' '.join([str(c.value) for c in at.columns[at.headers.index('Country')] if c.value]) print col_str predictions = pipeline.predict([col_str]) print 'prediction' print predictions # [1, 0]