Пример #1
0
def from_csv_catalog(catalog_base):
    """Profile every CSV referenced by a CSVCatalog rooted at *catalog_base*.

    For each catalogued URI the CSV on disk is parsed with YACParser, the
    resulting table is run through the cell/column profiler sets and rendered
    to HTML in the current directory.  Failures on individual files are
    printed and skipped so one broken CSV does not abort the whole run.

    NOTE(review): the original defined an unused module-style ``profilers``
    list up front that was always overwritten inside the loop; it has been
    removed along with trailing commented-out code.
    """
    from csvcatalog import CSVCatalog
    c = CSVCatalog(catalog_base)

    cnt = 0
    for uri_info in c.get_uris():
        print(uri_info)
        csv_file = uri_info['disk_location']
        if uri_info['exception'] is None:
            print("none")
        uri = uri_info['uri']
        cnt += 1
        print("{}, {} -> {}".format(cnt, uri, csv_file))
        try:
            from pyyacp import YACParser

            yacp = YACParser(filename=csv_file,
                             sample_size=1800,
                             structure_detector=AdvanceStructureDetector())
            table = datatable.parseDataTables(yacp, url=uri)

            # Profiler classes (not instances): the profiler sets
            # instantiate them per run.
            profilers = [
                ColumnByCellProfilerSet([
                    ColumnPatternProfiler, ColumnStatsProfiler,
                    CharacterDistributionProfiler, BenfordsLawDistribution
                ]),
                ColumnProfilerSet([DataTypeDetection, DataTypeInterpretation])
            ]
            apply_profilers(table, profilers=profilers)

            to_html(table, cnt, dir='.')

        except Exception as e:
            # Best-effort batch processing: report the failure and continue.
            print(traceback.format_exc())
            print(sys.exc_info()[0])
            print(e)
        print('next')
Пример #2
0
class TestDescriptionDetectionAdvanceStructureDetector(unittest.TestCase):
    """Header-guessing checks for AdvanceStructureDetector on tiny tables."""

    def setUp(self):
        self.structure_detector = AdvanceStructureDetector()
        self.verbose = True

    def _headers(self, rows):
        # Single funnel for the detector call keeps each test one-line.
        return self.structure_detector.guess_headers(rows, verbose=self.verbose)

    def test_single_col_no_header(self):
        rows = [['City'], ['Vienna'], ['Salzburg']]
        self.assertListEqual([], self._headers(rows))

    def test_single_col_one_header(self):
        rows = [['Name'], ['Tim Tom'], ['Max Min']]
        self.assertListEqual([rows[0]], self._headers(rows))

    def test_single_col_one_header1(self):
        rows = [['Vor Nachname'], ['Tim Tom'], ['Max Min']]
        self.assertListEqual([], self._headers(rows))

    def test_single_col_one_header2(self):
        rows = [['Count'], ['10'], ['111']]
        self.assertListEqual([rows[0]], self._headers(rows))

    def test_multi_header(self):
        # Two one-column header rows expected above a numeric data row.
        rows = [
            ['Einwohner'],
            ['Population'],
            ['1799'],
        ]
        self.assertListEqual(rows[0:2], self._headers(rows))

    def test_col1_h1_mixed(self):
        # Three-column table: only the first row is expected as header.
        rows = [
            ['Population', 'City', 'Country'],
            ['1799', 'Vienna', 'Austria'],
            ['1799', 'Salzburg', 'Austria'],
        ]
        self.assertListEqual(rows[0:1], self._headers(rows))
0
def from_csv_iter(portalID='data_wu_ac_at'):
    """Profile the first few CSVs yielded by a portal's content iterator.

    Iterates ``(uri, csv_file)`` pairs from ``csvContent_iter(portalID)``,
    parses each file with YACParser, applies the profiler sets and writes an
    HTML report per table.  Stops after the 11th file; per-file errors are
    printed and processing continues with the next file.

    NOTE(review): the original defined an unused ``profilers`` list up front
    that was always overwritten inside the loop; it has been removed along
    with trailing commented-out code.
    """
    cnt = 0
    for uri, csv_file in csvContent_iter(portalID):
        cnt += 1
        print("{}, {} -> {}".format(cnt, uri, csv_file))
        try:
            from pyyacp import YACParser

            yacp = YACParser(filename=csv_file,
                             sample_size=1800,
                             structure_detector=AdvanceStructureDetector())
            table = datatable.parseDataTables(yacp, url=uri)

            # Profiler classes (not instances): the profiler sets
            # instantiate them per run.
            profilers = [
                ColumnByCellProfilerSet([
                    ColumnPatternProfiler, ColumnStatsProfiler,
                    CharacterDistributionProfiler, BenfordsLawDistribution
                ]),
                ColumnProfilerSet([DataTypeDetection, DataTypeInterpretation])
            ]
            apply_profilers(table, profilers=profilers)

            to_html(table, cnt, dir='.')

        except Exception as e:
            # Best-effort batch processing: report the failure and continue.
            print(traceback.format_exc())
            print(sys.exc_info()[0])
            print(e)
        print('next')
        if cnt > 10:
            break
Пример #4
0
# -*- coding: utf-8 -*-
import  pyyacp.datatable as datatable
from pyyacp.table_structure_helper import AdvanceStructureDetector

SAMPLES_PATH = "sample_csvs"
from os import listdir
from os.path import isfile, join
onlyfiles = [join(SAMPLES_PATH, f) for f in listdir(SAMPLES_PATH) if isfile(join(SAMPLES_PATH, f))]



# Parse only the known multi-header/multi-table sample file and dump every
# detected sub-table together with its structural metadata.
# NOTE(review): 'milti' in the filename looks like a typo, but it must match
# the actual file on disk — confirm before renaming.
for csv_file in onlyfiles:
    if 'multi_head_milti_table.csv' not in csv_file:
        continue

    from pyyacp import YACParser

    # sample_size bounds how many lines the parser reads for its analysis;
    # max_tables=10 below allows the file to split into several tables.
    yacp = YACParser(filename=csv_file,structure_detector = AdvanceStructureDetector(),sample_size=1800)
    print yacp


    tables=datatable.parseDataTables(yacp, url='http://example.org/test', max_tables=10)

    for table in tables:
        # Show shape and a preview of the parsed data...
        print table.data.shape
        print table.data.head(5)


        # ...plus the description/comment lines and header rows the
        # structure detector separated out.
        print 'Comments', table.comments
        print 'Headers', table.header_rows
Пример #5
0
 def setUp(self):
     """Create a fresh detector and enable verbose output for each test."""
     self.structure_detector = AdvanceStructureDetector()
     self.verbose = True
Пример #6
0
def parseDataTables(yacpParser,
                    url=None,
                    batches=80,
                    max_tables=1,
                    raiseError=True,
                    structure_detector=None):
    """Split the rows produced by *yacpParser* into one or more DataTables.

    Rows are consumed in batches of *batches* lines.  Within each batch the
    row widths are run-length encoded; a batch with a single run belongs to
    one table, while several runs indicate empty lines, comment lines or the
    start of a new table.

    Parameters:
        yacpParser: YACParser-like iterable of rows; must support
            ``seek_line()`` and expose ``.meta`` and ``.url``.
        url: optional source URL stored on every created DataTable.
        batches: number of rows analysed per shape-detection step.
        max_tables: number of tables tolerated before raising.
        raiseError: if True, raise YACParserException when more than
            *max_tables* tables are found; otherwise they are returned anyway.
        structure_detector: detector used to classify description/header
            lines.  Defaults to a fresh AdvanceStructureDetector per call
            (the old ``=AdvanceStructureDetector()`` default was evaluated
            once at import time and shared between all invocations).

    Returns:
        A single DataTable when ``max_tables == 1``, otherwise the list of
        all detected tables.

    Raises:
        YACParserException: when no table at all can be detected, or when
            too many tables are found and *raiseError* is set.
    """
    if structure_detector is None:
        # Create per call instead of using a mutable/shared default argument.
        structure_detector = AdvanceStructureDetector()

    yacpParser.seek_line(0)
    tables = []
    cur_dt = None          # table currently being filled
    groups = []            # all (width, run_length) runs, for logging

    def grouper(n, iterable):
        # Yield successive n-sized tuples from *iterable*; last may be short.
        it = iter(iterable)
        while True:
            chunk = tuple(itertools.islice(it, n))
            if not chunk:
                return
            yield chunk

    rows = 0
    rows_to_add = []       # rows buffered for cur_dt, flushed in one go
    skipped = 0
    for g_rows in grouper(batches, yacpParser):
        rows += len(g_rows)

        # Analyse the shape of the rows.  list() is required here: r_len is
        # consumed several times below (max, mode, groupby), which would
        # exhaust a lazy map() iterator on Python 3.
        r_len = list(map(len, g_rows))
        max_len = max(r_len)
        est_colNo = _most_common_oneliner(r_len)
        # Run-length encode the row widths: [(width, run_length), ...]
        grouped_L = [(k, sum(1 for i in g))
                     for k, g in itertools.groupby(r_len)]

        groups += grouped_L

        if len(grouped_L) == 1:
            # perfect, one table in this batch
            if cur_dt is None:
                # we have no table yet, this is the first
                comments = structure_detector.guess_description_lines(
                    list(g_rows))
                header = structure_detector.guess_headers(list(g_rows))
                cur_dt = DataTable(yacpParser.meta,
                                   est_colNo,
                                   comments=comments,
                                   headers=header,
                                   url=url,
                                   id=len(tables))

                # Data starts after the detected comment and header lines.
                pos = len(comments) + len(header)
                rows_to_add.extend(g_rows[pos:])
            elif max_len == cur_dt.no_cols:
                rows_to_add.extend(g_rows)
            else:
                # not the same length, maybe different table, should not happen
                log.warning("NOT IMPLEMENTED",
                            filename=yacpParser.url,
                            msg="not the same length, maybe different table")
        else:
            # Several width-runs in this batch, e.g.:
            # (2,30) -> belongs to old table
            # (0,1)  -> empty line
            # (1,1)  -> comment line -> flag create_new
            # (4,20) -> belongs to new table -> create new table, start parsing at (1,1)

            cur_line = 0
            create_new = False
            for i, group in enumerate(grouped_L):

                if group[0] == 0:  # empty line, skip
                    skipped += 1
                    pass
                elif group[0] == 1 and group[1] < 3:
                    # A short run of one-column rows: treated as comment
                    # lines, which usually announce a new table.
                    if i == len(grouped_L) - 1 and [
                            sum(x) for x in zip(*grouped_L)
                    ][1] < batches:
                        # Last run of a short (final) batch: trailing comments.
                        log.warning("SUFFIX COMMENT LINES")
                    else:
                        # More groups follow: a new table starts at this line.
                        if not create_new:
                            parse_start = cur_line
                        create_new = True
                else:
                    # A run of multi-column rows.
                    start = None
                    if cur_dt is None or create_new:
                        start = cur_line
                        if create_new:
                            # Include the comment lines flagged above.
                            start = parse_start
                    elif group[0] == cur_dt.no_cols:
                        # Same width as the current table: plain data rows.
                        rows_to_add.extend(g_rows[cur_line:cur_line +
                                                  group[1]])
                    else:
                        # seems like a new table
                        if group[1] != 1 or (
                                i == len(grouped_L) - 1
                                and [sum(x)
                                     for x in zip(*grouped_L)][1] == batches):
                            # more than one row
                            # OR at the end of the group and still a full batch
                            start = cur_line
                        else:
                            # A single odd-width row mid-batch or at the end
                            # of the file is ignored, not made a new table.
                            pass

                    if start is not None:
                        # Flush the current table before opening a new one.
                        if cur_dt:
                            with Timer(key="adding {} rows".format(
                                    len(rows_to_add)),
                                       verbose=True):
                                cur_dt.addRows(rows_to_add)
                                rows_to_add = []
                            tables.append(cur_dt)

                        _rows = g_rows[start:]
                        comments = structure_detector.guess_description_lines(
                            _rows)
                        header = structure_detector.guess_headers(_rows)

                        cur_dt = DataTable(yacpParser.meta,
                                           group[0],
                                           comments=comments,
                                           headers=header,
                                           url=url,
                                           id=len(tables))

                        pos = len(comments) + len(header) + start
                        end = cur_line + group[1]

                        rows_to_add.extend(g_rows[pos:end])
                        create_new = False

                cur_line += group[1]

    if cur_dt is None:
        # Input held only empty/comment lines (or nothing at all): fail with
        # a parser error instead of the AttributeError the old code hit here.
        raise YACParserException("no data table detected in " + str(url))
    cur_dt.addRows(rows_to_add)
    rows_to_add = []
    tables.append(cur_dt)

    # Merge adjacent runs of equal width across batch borders for logging.
    prev_group = None
    agg_groups = []
    for group in groups:
        if prev_group is not None:
            if prev_group[0] == group[0]:
                # merge
                prev_group = (prev_group[0], prev_group[1] + group[1])
            else:
                agg_groups.append(prev_group)
                prev_group = group
        else:
            prev_group = group

    agg_groups.append(prev_group)
    log.info("TABLE SHAPE", groups=agg_groups, filename=url)

    if len(tables) > max_tables:
        if raiseError:
            raise YACParserException("Too many tables (#" + str(len(tables)) +
                                     ") shapes:" + str(agg_groups))

    log.info("Parsed table", skipped=skipped, tables=len(tables))

    if max_tables == 1:
        return tables[0]
    else:
        return tables
Пример #7
0
 def setUp(self):
     """Create a fresh AdvanceStructureDetector for each test case."""
     self.structure_detector = AdvanceStructureDetector()
Пример #8
0
    FDProfiler(),
    ColumnPatternProfiler(),
    ColumnStatsProfiler(),
    DataTypeDetection()
]  #,XSDTypeDetection()] #,ColumnRegexProfiler()]#,XSDTypeDetection()],,ColumnStatsProfiler()
cnt = 0
t = []
for uri, csv_file in csvContent_iter(portalID):
    cnt += 1
    print "{}, {} -> {}".format(cnt, uri, csv_file)
    try:
        from pyyacp import YACParser

        yacp = YACParser(filename=csv_file,
                         sample_size=1800,
                         structure_detector=AdvanceStructureDetector())
        table = datatable.parseDataTables(yacp, url=uri)
        table.apply_profiler(profilers=profilers)

        print ">>>>>TABLE{}".format(cnt)

        print('_' * 80)
        print 'Headers', table.header_rows
        print '_' * 30, 'DATA {}'.format(table.data.shape), '_' * 30
        print '_' * 30, 'META', '_' * 30
        for k in table.meta:
            print '[{}] {} '.format(k, table.meta[k])
        print table.colum_profiles()

        t.append(table)
    except Exception as e: