Example #1
    def test_column_stats(self):
        # build analyser table
        data = "c1,c2,c3,c4\n" \
               "1,0.1,1,\n" \
               "2,5.1,2,\n" \
               "3,5.2,1,\n" \
               "4,5.6,3,\n" \
               "5,,4,\n" \
               "6,19,6,"

        data_tables = tablemagician.from_file_object(StringIO.StringIO(data))
        self.analyser_table = data_tables[0].process()
        data_tables[0].close()

        a = ComplexTypeAnalyser()
        b = ColumnStatsAnalyser()

        analyser_chain = [a, b]
        # build engine
        engine = AnalyserEngine(analyser_chain)
        # feed with analyser table
        engine.process(self.analyser_table)

        stats = self.analyser_table.analysers[ColumnStatsAnalyser.name]
        self.assertEqual(stats[0]['regression'], 'INCREASE/LINEAR/1.0')
        self.assertEqual(stats[1]['regression'], 'INCREASE/MONOTONIC')
        self.assertTrue('regression' not in stats[2])
Example #2
def feature_extraction(filename):
    if filename.endswith('.gz'):
        f = gzip.open(filename, 'rb')
    else:
        f = open(filename, 'rb')

    # read first rows of table
    tables = tablemagician.from_file_object(f, filename)
    if len(tables) == 0:
        raise ValueError('No table: ' + filename)
    for table in tables:
        #analyser_table = table.process(max_lines=MAX_ROWS)

        num_of_columns = len(table.headers)

        header = [h.lower() for h in table.headers]

        types = [t.result_type for t in table.types]

        # count numeric (int / Decimal) and string-typed columns
        numeric = types.count(int) + types.count(decimal.Decimal)
        strings = types.count(basestring)
        # close the input file and build the feature vector for this table
        f.close()
        feature = [num_of_columns, numeric, strings]
        break

    # reopen file after iteration
    if filename.endswith('.gz'):
        f = gzip.open(filename, 'rb')
    else:
        f = open(filename, 'rb')
    number_lines = sum(1 for _ in f)
    f.close()
    return [number_lines] + feature
Example #3
def write_analyser_table_to_file(f, filename, out_dir, analyser_engine):
    data_tables = tablemagician.from_file_object(f, filename)
    for dt in data_tables:
        analyser_table = dt.process(max_lines=MAX_LINES)

        # feed with analyser table
        analyser_engine.process(analyser_table)

        # write analyser table in out dir
        out_path = os.path.join(out_dir, filename + '.pkl')
        with open(out_path, 'wb') as h:
            pickle.dump(analyser_table, h)
    dt.close()
Example #4
def filter_out_files(rootdir, outdir):
    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            try:
                filename = os.path.join(rootdir, file)
                # unzip files
                if filename.endswith('.gz'):

                    f = gzip.open(filename, 'rb')
                    # count all lines
                    number_lines = sum(1 for _ in f)
                    f.close()

                    # open again after iteration
                    f = gzip.open(filename, 'rb')
                    # read first rows of table
                    tables = tablemagician.from_file_object(f, filename)
                    for table in tables:
                        #analyser_table = table.process(max_lines=MAX_ROWS)

                        # columns
                        num_of_columns = len(table.headers)
                        # types
                        types = [t.result_type for t in table.types]

                        # filter criteria, bigger sized tables
                        if number_lines > 50 and num_of_columns > 6:
                            if len(types) == types.count(basestring):
                                # pure string tables
                                str_dir = os.path.join(outdir, 'all_string')
                                if not os.path.exists(str_dir):
                                    os.mkdir(str_dir)
                                shutil.copyfile(filename, os.path.join(str_dir, file))
                            elif basestring in types and (types.count(int) + types.count(decimal.Decimal)) >= len(types)/2:
                                # containing a string column and multiple numerical columns
                                num_dir = os.path.join(outdir, 'some_numeric')
                                if not os.path.exists(num_dir):
                                    os.mkdir(num_dir)
                                shutil.copyfile(filename, os.path.join(num_dir, file))

                    table.close()
            except Exception as e:
                traceback.print_exc()
                print e
Example #5
    def test_type_detection(self):
        # build analyser table
        data = "c1,c2,c3,c4\n" \
               "12cm,3%,€300.50,\n" \
               "1 cm,1%,€ 12.345,\n" \
               "1.5 cm,0.5%,€ 130,34.2\n" \
               "1 cm,1%,€ 12.345,\n" \
               "1.5 cm,0.5%,€ 130,34.2\n" \
               "1.5 cm,0.5%,€ 130.1000,"

        data_tables = tablemagician.from_file_object(StringIO.StringIO(data))
        self.analyser_table = data_tables[0].process()
        data_tables[0].close()

        a = ComplexTypeAnalyser()
        b = ColumnStatsAnalyser()

        analyser_chain = [a, b]
        # build engine
        engine = AnalyserEngine(analyser_chain)
        # feed with analyser table
        engine.process(self.analyser_table)

        columns = self.analyser_table.analysers[ComplexTypeAnalyser.name]
        for t in columns[0]:
            self.assertTrue(t.startswith('NUMALPHA'))

        self.assertTrue(columns[0]['NUMALPHA/NUMBER/INT:1+-ALPHA:2'] == 2)

        for t in columns[1]:
            self.assertTrue(t.startswith('NUMALPHA'))

        self.assertTrue(columns[1]['NUMALPHA/NUMBER/FLOAT:1.1-ALPHA+:1'] == 3)

        for t in columns[2]:
            self.assertTrue(t.startswith('ALPHANUM'))

        self.assertTrue(columns[2]['ALPHANUM/ALPHA+:1-NUMBER/FLOAT:2.3'] == 2)
        self.assertTrue(columns[2]['ALPHANUM/ALPHA+:1-NUMBER/FLOAT:3.*'] == 1)
        self.assertTrue(columns[3]['EMPTY'] == 4)

        for stats in self.analyser_table.analysers[ColumnStatsAnalyser.name]:
            print 'ColStats:', stats
Example #6
def build_by_header(header_set, rootdir, dest_dir):
    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            try:
                filename = os.path.join(rootdir, file)
                # unzip files
                if filename.endswith('.gz'):
                    f = gzip.open(filename, 'rb')
                else:
                    f = open(filename, 'rb')
                datatables = tablemagician.from_file_object(f, file)
                for dt in datatables:
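                    # collect the requested headers that also appear (lower-cased) in this table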
                    intersec = set(header_set) & set([h.lower().strip() for h in dt.headers])
                    if len(intersec) > 0:
                        train_dir = os.path.join(dest_dir, intersec.pop())
                        if not os.path.exists(train_dir):
                            os.mkdir(train_dir)
                        shutil.copyfile(filename, os.path.join(train_dir, file))
                f.close()
            except Exception as e:
                print(file + ' - ' + str(e))
                print(traceback.format_exc())
Example #7
def read_files(path, d):
    train_set = []
    train_dir = join(path, d)
    onlyfiles = [join(train_dir, file) for file in listdir(train_dir) if isfile(join(train_dir, file))]
    for filename in onlyfiles:
        try:
            # unzip files
            if filename.endswith('.gz'):
                f = gzip.open(filename, 'rb')
            else:
                f = open(filename, 'rb')
            datatables = tablemagician.from_file_object(f, filename)
            for dt in datatables:
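                # process the table and keep columns whose header matches the label d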
                analyser_table = dt.process(max_lines=MAX_LINES)
                for i, h in enumerate(analyser_table.headers):
                    if d == h.lower().strip():
                        train_set.append({'values': analyser_table.columns[i], 'name': filename})
            f.close()
        except Exception as e:
            print(filename + ' - ' + str(e))
            print(traceback.format_exc())
    with open(join(path, d + '.pkl'), 'wb') as handle:
        pickle.dump(train_set, handle)
Example #8
def main():
    args = arg_parser()

    rootdir = args.in_dir
    outdir = args.out_dir

    if args.filter:
        filter_out_files(rootdir, outdir)

    num_of_rows = defaultdict(int)
    num_of_columns = defaultdict(int)
    header = defaultdict(int)
    types = defaultdict(int)
    column_types = defaultdict(int)

    start = time.time()
    row_counting_time = 0

    i = 0
    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            try:
                filename = os.path.join(rootdir, file)

                # unzip files
                if filename.endswith('.gz'):

                    f = gzip.open(filename, 'rb')
                    # count all lines
                    start_row_counting = time.time()
                    number_lines = sum(1 for _ in f)
                    end_row_counting = time.time()
                    row_counting_time += end_row_counting - start_row_counting
                    num_of_rows[number_lines] += 1
                    f.close()


                    # open again after iteration
                    f = gzip.open(filename, 'rb')
                    # read first rows of table
                    tables = tablemagician.from_file_object(f, filename)
                    for table in tables:
                        #analyser_table = table.process(max_lines=MAX_ROWS)

                        # columns
                        num_of_columns[len(table.headers)] += 1
                        # header
                        for h in table.headers:
                            header[h.lower().strip()] += 1

                        table_type = type_classification([t.result_type for t in table.types], column_types)
                        types[table_type] += 1
                    table.close()
                    i += 1
            except Exception as e:
                traceback.print_exc()
                print e

            if i % 100 == 0:
                print i, 'files processed'

    # write the collected distributions out as simple CSV reports
    with open('rows.csv', 'wb') as f:
        f.write('rows,count\n')
        for r in sorted(num_of_rows, key=num_of_rows.get, reverse=True):
            f.write(str(r) + ',' + str(num_of_rows[r]) + '\n')

    with open('columns.csv', 'wb') as f:
        f.write('columns,count\n')
        for r in sorted(num_of_columns, key=num_of_columns.get, reverse=True):
            f.write(str(r) + ',' + str(num_of_columns[r]) + '\n')

    with io.open('header.csv', 'wt') as f:
        f.write(u'header,count\n')
        for r in sorted(header, key=header.get, reverse=True):
            try:
                f.write(unicode(r) + ',' + unicode(header[r]) + u'\n')
            except Exception as e:
                print e

    with open('types.csv', 'wb') as f:
        f.write('type,count\n')
        for r in sorted(types, key=types.get, reverse=True):
            f.write(str(r) + ',' + str(types[r]) + '\n')

    with open('column_types.csv', 'wb') as f:
        f.write('column_type,count\n')
        for r in sorted(column_types, key=column_types.get, reverse=True):
            f.write(str(r) + ',' + str(column_types[r]) + '\n')

    end = time.time()
    print 'total time:', end - start
    print 'row counting time:', row_counting_time
Example #9
if __name__ == '__main__':
    path = sys.argv[1]
    headers = sys.argv[2:]

    train_sets = {}
    for h in headers:
        with open(join(path, h + '.pkl'), 'rb') as handle:
            train_sets[h] = pickle.load(handle)

    data = build_data_frame(train_sets)
    data = data.reindex(numpy.random.permutation(data.index))
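    # unigram/bigram bag-of-words features feeding a multinomial Naive Bayes classifier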
    pipeline = Pipeline([
        ('count_vectorizer',   CountVectorizer(ngram_range=(1, 2))),
        ('classifier',         MultinomialNB())
    ])

    pipeline.fit(data['text'].values, data['class'].values)

    # build example data
    # country
    ex = 'https%3A%2F%2Fcommondatastorage.googleapis.com%2Fckannet-storage%2F2012-04-17T212635%2Fcommitments-export-final.csv.gz'
    with gzip.open(join('/home/sebastian/csv_stats/bandersnatch', ex)) as handle:
        dts = tablemagician.from_file_object(handle)
        for dt in dts:
            at = dt.process(max_lines=100)
            col_str = ' '.join([str(c.value) for c in at.columns[at.headers.index('Country')] if c.value])
            print col_str

    predictions = pipeline.predict([col_str])
    print 'prediction'
    print predictions # [1, 0]
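
All of the snippets above follow the same basic pattern around tablemagician.from_file_object: open the (possibly gzipped) file, build the data tables, process a limited number of rows, and then inspect the headers, detected column types, or attached analyser results. Below is a minimal consolidated sketch of that pattern, written against the same calls used in the examples; describe_table and the MAX_LINES value of 100 are illustrative placeholders, not part of the original code.

import gzip

import tablemagician

MAX_LINES = 100  # illustrative row limit, analogous to the constant used in the examples


def describe_table(filename):
    # open plain or gzipped files exactly as the examples above do
    if filename.endswith('.gz'):
        f = gzip.open(filename, 'rb')
    else:
        f = open(filename, 'rb')
    try:
        tables = tablemagician.from_file_object(f, filename)
        for table in tables:
            analyser_table = table.process(max_lines=MAX_LINES)
            # header names and the detected type of each column
            return analyser_table.headers, [t.result_type for t in table.types]
    finally:
        f.close()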