def replace_split_transactions(table, dict_split_transaction_details):
    global g

    list_rows_to_add = []
    list_rows_to_remove = []
    for row in petl.records(table):
        if row['Account'] == 'Split Transaction':
            sk_indiv_id = row['SK Individual ID']
            batch_date = row['Batch Date']
            amount_str = row['SK Amount']
            string_key = sk_indiv_id + ',' + batch_date
            contrib_amount = Decimal(re.sub(r'[^\d.]', '', amount_str))
            if string_key in dict_split_transaction_details:
                splits_total = Decimal(0)
                for split_entry in dict_split_transaction_details[string_key]:
                    splits_total += Decimal(re.sub(r'[^\d.]', '', split_entry['Amount']))
                if contrib_amount != splits_total:
                    print "*** ERROR!  For Individual ID, Batch Date " + string_key + ", the main 'Split " \
                        "Transaction' entry amount was " + str(contrib_amount) + " but sum of split detail " \
                        "transactions was " + str(splits_total)
                else:
                    list_rows_to_remove.append({
                        'SK Individual ID': sk_indiv_id,
                        'Batch Date': batch_date,
                        'SK Amount': amount_str,
                        'Account': 'Split Transaction'
                        })
                    for split_entry in dict_split_transaction_details[string_key]:
                        list_rows_to_add.append({
                            'Env #': row['Env #'],
                            'Batch Date': batch_date,
                            'SK Amount': split_entry['Amount'],
                            'Type': row['Type'],
                            'Account': split_entry['Account'],
                            'Tax': split_entry['Tax'],
                            'Check #': row['Check #'],
                            'Notes': "Inserted from 'Split Transaction'. " + row['Notes'],
                            # 'Family ID': row['Family ID'],
                            'SK Individual ID': row['SK Individual ID']
                            # 'To Date': row['To Date'],
                            # 'Contribution Link': row['Contribution Link']
                            })
            else:
                print "*** ERROR!  Cannot find any 'Split Transaction' details for record with 'Batch Date' " + \
                    row['Batch Date'] + ", contributed by 'Individual ID' " + row['SK Individual ID'] + " for the " \
                    "amount " + row['SK Amount']

    print '*** Count before remove_rows(): ' + str(petl.nrows(table))
    table = remove_rows(table, list_rows_to_remove)
    print '*** Count after remove_rows(): ' + str(petl.nrows(table))

    print '*** Count before add_rows(): ' + str(petl.nrows(table))
    table = add_rows(table, list_rows_to_add)
    print '*** Count after add_rows(): ' + str(petl.nrows(table))

    return table
Example #2
def sync(ctx: typer.Context,
         project: str = typer.Argument(
             ..., help='The name for the project, specified in config file'),
         since: datetime = typer.Option(..., formats=['%Y-%m-%d']),
         until: datetime = typer.Option(..., formats=['%Y-%m-%d']),
         dry: bool = typer.Option(
             False,
             help='Use log entries instead of uploading them to redmine'),
         drain: bool = typer.Option(
             False,
             help='Use drain issues for entries without specified dest')):
    config = setup_config(ctx, ctx.meta['config_path'])
    setup_http(ctx)

    ctx.meta['rdm_user'] = extract.get_redmine_user(config["redmine"]["url"])

    time_entries = get_toggl_enteries(config, project, since, until)

    issues = get_redmine_issues(config, project, since)

    issue_ids = petl.columns(issues)['id']
    entries_to_load, unset_entries = petl.biselect(
        time_entries, lambda row: row['issue_id'] in issue_ids)

    if drain and petl.nrows(unset_entries):
        log.info('Using drain')

        drained, unset_entries = drained_entries(ctx, issues, unset_entries,
                                                 project)

        log.info(f'Drained {petl.nrows(drained)} issues')

        entries_to_load = petl.cat(entries_to_load, drained)

    if petl.nrows(unset_entries):
        log.warning(f'There\'re {petl.nrows(unset_entries)} unset entries')

    if get_proj_attr(config, project, 'group_entries'):
        log.info('Using group by day and description')

        entries_to_load = transform.group_entries_by_day(entries_to_load)

    load.to_redmine_time(config["redmine"]["url"],
                         entries_to_load,
                         activity_id=get_proj_attr(config, project,
                                                   'rdm_activity_id'),
                         user_id=ctx.meta['rdm_user'].get('id'),
                         dry=dry)
Example #3
    def num_rows(self):
        """
        `Returns:`
            int
                Number of rows in the table
        """
        return petl.nrows(self.table)
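For orientation, here is a minimal, self-contained sketch of how a wrapper method like the one above behaves; the MiniTable class is hypothetical and not part of the project the example was taken from:

import petl


class MiniTable:
    # Minimal stand-in for a table wrapper that keeps a petl table in self.table.
    def __init__(self, table):
        self.table = table

    def num_rows(self):
        # petl.nrows counts data rows only; the header row is excluded.
        return petl.nrows(self.table)


print(MiniTable([['foo', 'bar'], ['a', 1], ['b', 2]]).num_rows())  # 2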
Example #4
File: table.py Project: tiburona/parsons
    def __bool__(self):

        # Try to get a single row from our table
        head_one = petl.head(self.table)

        # See if our single row is empty
        return petl.nrows(head_one) > 0
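The same idea can be sketched as a standalone helper (the function name below is illustrative, not from the project): petl.head returns a lazy view limited to the first few rows, so counting its rows stays cheap even when the underlying table is large.

import petl


def table_is_nonempty(table):
    # Only the head of the table is iterated, so this avoids a full scan.
    return petl.nrows(petl.head(table)) > 0


print(table_is_nonempty([['foo', 'bar'], ['a', 1]]))  # True
print(table_is_nonempty([['foo', 'bar']]))            # False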
Example #5
def display_store():
    response.headers['Access-Control-Allow-Origin'] = '*'
    response.headers['Content-type'] = 'application/json'
    table = (
        etl
        .fromcsv('store_locations.csv')
        .convert('Lat', float)
        .convert('Lon', float)
    )
    store_id = request.query.postcode

    # Select rows
    table1 = etl.select(table, "{Postcode}=='" + store_id + "'")

    # Set default postcode of 2000 
    if etl.nrows(table1) == 0:
        defaultPostCode = "2000"
        table1 = etl.select(table, "{Postcode}=='" + defaultPostCode + "'")

    # Reorder fields
    print(table1)
    table2 = etl.cut(table1, 'Name', 'Lat', 'Lon').dicts()[0]

    print(table2)
    return table2
Example #6
def test_different_days():
    data = [
        ['dur', 'description', 'start'],
        [timedelta(), 'test', datetime(2000, 1, 1, 15)],
        [timedelta(), 'test', datetime(2000, 1, 2, 15)],
    ]

    result = group_entries_by_day(data)
    assert petl.nrows(result) == 2
Example #7
def valuecounts(table, col_name):
    return_dict = {}
    reported_count = 0
    unreported_count = 0
    column = petl.values(table, col_name)
    nrows = petl.nrows(table)
    non_blanks = petl.select(table, '{' + quote_single_quote(col_name) + "} != ''")
    num_blanks = nrows - petl.nrows(non_blanks)
    counts_table = petl.valuecounts(non_blanks, col_name)
    for row in petl.records(counts_table):
        if row['frequency'] > 0.01:
            return_dict[row[col_name]] = row['count']
            reported_count += row['count']
        else:
            unreported_count += row['count']
    return_dict['<other>'] = unreported_count
    return_dict['<blank>'] = num_blanks
    return return_dict
Example #8
def test_same_day():
    data = [
        ['dur', 'description', 'start'],
        [timedelta(), 'test',
         datetime(2000, 1, 1, 15, 10)],
        [timedelta(), 'test', datetime(2000, 1, 1, 15, 0)],
    ]

    result = group_entries_by_day(data)
    assert petl.nrows(result) == 1
Example #9
def valuecounts(table, col_name):
    return_dict = {}
    reported_count = 0
    unreported_count = 0
    column = petl.values(table, col_name)
    nrows = petl.nrows(table)
    non_blanks = petl.select(table,
                             '{' + quote_single_quote(col_name) + "} != ''")
    num_blanks = nrows - petl.nrows(non_blanks)
    counts_table = petl.valuecounts(non_blanks, col_name)
    for row in petl.records(counts_table):
        if row['frequency'] > 0.01:
            return_dict[row[col_name]] = row['count']
            reported_count += row['count']
        else:
            unreported_count += row['count']
    return_dict['<other>'] = unreported_count
    return_dict['<blank>'] = num_blanks
    return return_dict
Example #10
    def store_to_db(self, conn, tablename, data):
        try:
            if etl.nrows(data) == 0:
                return None
        except TypeError:
            return None

        cursor = conn.cursor()
        sql = "INSERT INTO %s (%s) " % (tablename, ','.join(
            etl.header(data))) + "VALUES %s"
        execute_values(cursor, sql, etl.data(data))
        conn.commit()
        conn.close()
Example #11
def vehicleP():
    while True:
        feed.ParseFromString(
            urlopen(
                'http://gtfs.openov.nl/gtfs-rt/vehiclePositions.pb').read())
        data = []

        timer1 = datetime.now()
        timer2 = datetime.now() - timedelta(minutes=1)
        for entity in feed.entity:
            vp = entity.vehicle
            timex = datetime.fromtimestamp(vp.timestamp)
            if timex < timer1 and timex > timer2:
                x = vp.position.longitude
                y = vp.position.latitude
                time = datetime.fromtimestamp(vp.timestamp)
                geo = shape.from_shape(Point(x, y), srid=4326)
                # Note: the original assignments below ended in stray trailing
                # commas, which turned every value into a one-element tuple.
                schedule_relationship = vp.trip.schedule_relationship
                direction_id = vp.trip.direction_id
                current_stop_sequence = vp.current_stop_sequence
                current_status = vp.current_status
                trip_id = vp.trip.trip_id
                route_id = vp.trip.route_id
                stop_id = vp.stop_id
                # trip_start_time = datetime.strptime(vp.trip.start_time, '%H:%M:%S').time()
                # trip_start_date = datetime.strptime(vp.trip.start_date, "%d%m%Y").date()
                trip_start_time = vp.trip.start_time
                trip_start_date = vp.trip.start_date
                vehicle_label = vp.vehicle.label

                # Append inside the time-window check so that values from a
                # previous iteration (or undefined names) are never reused.
                data.append({
                    'time': time,
                    'geo_loc': str(geo),
                    'schedule_relationship': schedule_relationship,
                    'direction_id': direction_id,
                    'current_stop_sequence': current_stop_sequence,
                    'current_status': current_status,
                    'trip_id': trip_id,
                    'route_id': route_id,
                    'stop_id': stop_id,
                    'trip_start_time': trip_start_time,
                    'trip_start_date': trip_start_date,
                    'vehicle_label': vehicle_label,
                })

        table1 = petl.fromdicts(data)
        print(petl.nrows(table1))
        petl.appenddb(table1, con, 'vehicle_positions')
        t.sleep(60)
Example #12
    def empty_column(self, column):
        """
        Checks if a given column is empty. Returns ``True`` if empty and ``False``
        if not empty.

        `Args:`
            column: str
                The column name
        `Returns:`
            bool
        """

        if petl.nrows(petl.selectnotnone(self.table, column)) == 0:
            return True
        else:
            return False
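A rough equivalent using petl directly, shown here as a sketch for readers not using the wrapper class: selectnotnone keeps only the rows whose value in the given field is not None, so an empty column is one that yields no such rows.

import petl

table = [['a', 'b'], [1, None], [2, None]]
print(petl.nrows(petl.selectnotnone(table, 'b')) == 0)  # True: column 'b' is empty
print(petl.nrows(petl.selectnotnone(table, 'a')) == 0)  # False: column 'a' has values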
Example #13
    def etl(self, record_id):
        for model in self.__models__:
            location_table = self.extract(model, record_id)
            nrows = petl.nrows(location_table)

            if nrows == 1:
                record = petl.dicts(location_table)[0]
                if self._has_observations(record):
                    self._added = True
                    location_id = self._post_location(record, model)
                    thing_id = self._post_thing(record, model, location_id)

                    self.add_package(record)
                    self.observation.etl(tids=self._make_tids(thing_id, record),
                                         models=(model,))
            else:
                print(f'multiple records found for given record_id. Skipping {record_id}')
Example #14
def get_toggl_enteries(config, project, since, until):
    if project not in config['project']:
        log.error('No such project in config')
        raise typer.Exit(code=1)
    else:
        project_cfg = config['project'][project]
    time_entries = extract.from_toggl_timeenteries(
        workspace=config['toggl']['workspace_id'],
        projects=project_cfg['tgl_project_id'],
        since=since.date(),
        until=until.date())
    nrows = petl.nrows(time_entries)
    if nrows == 0:
        log.info('No entries found')
        raise typer.Exit()
    time_entries = transform.parse_datetime(time_entries,
                                            ['start', 'end', 'updated'])
    time_entries = transform.parse_duration(time_entries)
    time_entries = transform.add_issue_id_from_description(time_entries)
    return time_entries
Example #15
    def get_context_data(self, **kwargs):
        context = super().get_context_data(**kwargs)
        context["filename"] = self.object.downloaded_file.name.split(os.path.sep)[-1]
        context["count_query_kwarg"] = self.count_query_kwarg

        table = petl.fromcsv(self.object.downloaded_file)
        context["header"] = petl.header(table)

        try:
            record_count_to_show = int(self.request.GET.get(self.count_query_kwarg))
        except (TypeError, ValueError):
            record_count_to_show = self.count_increment

        # Potentially expensive, cache / save in database for dataset
        if petl.nrows(table) > record_count_to_show:
            context[
                "load_more_url"
            ] = f"{self.request.path}?{self.count_query_kwarg}={record_count_to_show+self.count_increment}"

        context["rows"] = petl.records(petl.head(table, record_count_to_show))

        return context
Example #16
def tsv_fix(base_path, new_file_name, pk_list, illegal_columns_lower_case,
            tsv_process):
    if tsv_process:
        pwb_replace_in_file(new_file_name, '\0', '')  # Remove null bytes

    table = etl.fromcsv(new_file_name,
                        delimiter='\t',
                        skipinitialspace=True,
                        quoting=csv.QUOTE_NONE,
                        quotechar='',
                        escapechar='')

    row_count = etl.nrows(table)

    if tsv_process:
        tempfile = NamedTemporaryFile(mode='w',
                                      dir=base_path + "/content/data/",
                                      delete=False)

        table = pwb_lower_case_header(table)
        table = etl.rename(table, illegal_columns_lower_case, strict=False)

        print(new_file_name)
        for pk in pk_list:
            table = etl.convert(table, pk.lower(), lambda a: a
                                if len(str(a)) > 0 else '-')

        writer = csv.writer(tempfile,
                            delimiter='\t',
                            quoting=csv.QUOTE_NONE,
                            quotechar='',
                            escapechar='',
                            lineterminator='\n')
        writer.writerows(table)

        shutil.move(tempfile.name, new_file_name)
    return row_count
Example #17
    def export_process(self):
        total = petl.nrows(self.data)
        if self.data and total > 0:
            mappings = OrderedDict()
            mappings['Location'] = 'location'
            mappings['Position'] = 'position', lambda rec: construct_postion(
                rec)
            mappings[
                'Local Time'] = 'time', lambda rec: datetime.fromtimestamp(
                    rec).strftime("%Y-%m-%d %H:%M:%S")
            mappings[
                'Conditions Time'] = 'summary', lambda rec: populate_summary(
                    rec)
            mappings['Temperature'] = 'temperature'
            mappings['Pressure'] = 'pressure'
            mappings['Humidity'] = 'humidity', lambda rec: int(rec * 100)
            self.data = petl.fieldmap(self.data, mappings)
            return True

        else:
            print(
                "Data store doesn't have historic data. Please run import and export job."
            )
            return False
Example #18
"""
DB-related tests, separated from main unit tests because they need local database
setup prior to running.

"""

import sys
sys.path.insert(0, './src')
from petl import dummytable, sort, nrows
import logging
logging.basicConfig(level=logging.DEBUG)

t = (('foo', 'bar'), ('C', 2), ('A', 9), ('B', 6), ('E', 1), ('D', 10))
u = sort(t, buffersize=3)

print 'buffer up the data'
print nrows(u)

print 'create iterators'
it1 = iter(u)
it2 = iter(u)

print 'iterate'
print 1, it1.next()
print 1, it1.next()
print 1, it1.next()
print 2, it2.next()
print 2, it2.next()
print 1, it1.next()
print 1, it1.next()
Example #19
def main(argv):

    parser = argparse.ArgumentParser()
    parser.add_argument("--input-csv-filename",
                        required=True,
                        help="Input UTF8 CSV to summarize")
    parser.add_argument(
        "--sep-columns",
        required=False,
        nargs='*',
        default=argparse.SUPPRESS,
        help=
        "Column names of columns containing comma- or semi-colon-separated values"
    )
    parser.add_argument("--sep-character", required=False, help="Character used to separate values in multi-value " \
        "fields.  Defaults to ';' if not specified.")
    parser.add_argument("--skip-columns",
                        required=False,
                        nargs='*',
                        default=argparse.SUPPRESS,
                        help="Column names to NOT generate stats for")
    parser.add_argument("--skip-num-rows",
                        required=False,
                        type=int,
                        help="Skip specified number "
                        "of header rows")
    parser.add_argument(
        "--first-ccb-column",
        required=False,
        help="String name of first CCB column.  If "
        "specified, all preceding columns will be labeled 'Servant Keeper' and this column "
        "and all subsequent will be labeled 'CCB'")
    args = parser.parse_args()

    if args.first_ccb_column is not None:
        column_prefix = 'Servant Keeper '
    else:
        column_prefix = ''

    assert os.path.isfile(
        args.input_csv_filename
    ), "Error: cannot open file '" + args.input_csv_filename + "'"

    table = petl.fromcsv(args.input_csv_filename)

    # Skip header rows
    if args.skip_num_rows:
        skip_num = args.skip_num_rows
        assert skip_num > 0, "--skip-num-rows value '" + str(
            skip_num) + "' is invalid.  Must be positive."
        it = iter(table)
        while skip_num >= 0:
            row = next(it)
            skip_num -= 1
        table = petl.setheader(table, row)
        table = petl.tail(table, petl.nrows(table) - args.skip_num_rows)

    # Print nicely formatted stats for each column
    sep = ''
    args_dict = vars(args)
    skip_columns_specified = 'skip_columns' in args_dict
    sep_char_specified = 'sep_character' in args_dict
    for column in petl.header(table):
        if args.first_ccb_column is not None and column == args.first_ccb_column:
            column_prefix = 'CCB '
        if not skip_columns_specified or column not in args.skip_columns:
            output_str = column_prefix + "Column '" + column + "'"
            print sep + output_str
            print >> sys.stderr, output_str
            if args.sep_columns is not None and column in args.sep_columns:
                if sep_char_specified:
                    sep_character = args.sep_character
                else:
                    sep_character = ';'
                output_str = num_dict2str(
                    dict_dump(sep_valuecounter(table, column, sep_character)))
                print output_str
            else:
                output_str = num_dict2str(dict_dump(valuecounts(table,
                                                                column)))
                print output_str
        sep = '\n'

    # Flush to ensure all output is written
    sys.stdout.flush()
    sys.stderr.flush()
Example #20
import sys
sys.path.insert(0, './src')
from petl import dummytable, sort, nrows
import logging
logging.basicConfig(level=logging.DEBUG)

t = (('foo', 'bar'),
     ('C', 2),
     ('A', 9),
     ('B', 6),
     ('E', 1),
     ('D', 10))
u = sort(t, buffersize=3)

print 'buffer up the data'
print nrows(u)

print 'create iterators'
it1 = iter(u)
it2 = iter(u)

print 'iterate'
print 1, it1.next()
print 1, it1.next()
print 1, it1.next()
print 2, it2.next()
print 2, it2.next()
print 1, it1.next()
print 1, it1.next()
Example #21
def convert_folder(base_source_dir,
                   base_target_dir,
                   tmp_dir,
                   tika=False,
                   ocr=False,
                   merge=False,
                   tsv_source_path=None,
                   tsv_target_path=None,
                   make_unique=True,
                   sample=False,
                   zip=False):
    # WAIT: Add an option in the GUI to choose whether files should be OCR-processed
    txt_target_path = base_target_dir + '_result.txt'
    json_tmp_dir = base_target_dir + '_tmp'
    converted_now = False
    errors = False
    originals = False

    if merge is False:  # TODO: Are both arguments needed?
        make_unique = False

    if tsv_source_path is None:
        tsv_source_path = base_target_dir + '.tsv'
    else:
        txt_target_path = os.path.splitext(
            tsv_source_path)[1][1:] + '_result.txt'

    if tsv_target_path is None:
        tsv_target_path = base_target_dir + '_processed.tsv'

    if os.path.exists(tsv_target_path):
        os.remove(tsv_target_path)

    Path(base_target_dir).mkdir(parents=True, exist_ok=True)

    # TODO: Does the mime type show directly whether it is PDF/A, or must extra fields in the two below be checked? Pre-check with Tika and Siegfried?

    # TODO: Is this TSV check needed here? The check is done before this function is called, so it may be unnecessary.
    if not os.path.isfile(tsv_source_path):
        if tika:
            run_tika(tsv_source_path, base_source_dir, json_tmp_dir, zip)
        else:
            run_siegfried(base_source_dir, tmp_dir, tsv_source_path, zip)

    # TODO: Add a test that the TSV file is not empty
    replace_text_in_file(tsv_source_path, '\0', '')

    table = etl.fromtsv(tsv_source_path)
    table = etl.rename(table, {
        'filename': 'source_file_path',
        'tika_batch_fs_relative_path': 'source_file_path',
        'filesize': 'file_size',
        'mime': 'mime_type',
        'Content_Type': 'mime_type',
        'Version': 'version'
    },
                       strict=False)

    thumbs_table = etl.select(
        table, lambda rec: Path(rec.source_file_path).name == 'Thumbs.db')
    if etl.nrows(thumbs_table) > 0:
        thumbs_paths = etl.values(thumbs_table, 'source_file_path')
        for path in thumbs_paths:
            if '/' not in path:
                path = os.path.join(base_source_dir, path)
            if os.path.isfile(path):
                os.remove(path)

        table = etl.select(
            table, lambda rec: Path(rec.source_file_path).name != 'Thumbs.db')

    table = etl.select(table, lambda rec: rec.source_file_path != '')
    table = etl.select(table, lambda rec: '#' not in rec.source_file_path)
    # WAIT: The line above is not a complete check for embedded documents, since '#' can actually occur in file names
    row_count = etl.nrows(table)

    file_count = sum([len(files) for r, d, files in os.walk(base_source_dir)])

    if row_count == 0:
        print('No files to convert. Exiting.')
        return 'Error', file_count
    elif file_count != row_count:
        print('Row count: ' + str(row_count))
        print('File count: ' + str(file_count))
        print("Files listed in '" + tsv_source_path +
              "' doesn't match files on disk. Exiting.")
        return 'Error', file_count
    elif not zip:
        print('Converting files..')

    # WAIT: Add a check of file size before and after conversion

    append_fields = ('version', 'norm_file_path', 'result',
                     'original_file_copy', 'id')
    table = add_fields(append_fields, table)

    cut_fields = ('0', '1', 'X_TIKA_EXCEPTION_runtime',
                  'X_TIKA_EXCEPTION_warn')
    table = remove_fields(cut_fields, table)

    header = etl.header(table)
    append_tsv_row(tsv_target_path, header)

    # Treat csv (detected from extension only) as plain text:
    table = etl.convert(table,
                        'mime_type',
                        lambda v, row: 'text/plain'
                        if row.id == 'x-fmt/18' else v,
                        pass_row=True)

    # Update for missing mime types where id is known:
    table = etl.convert(table,
                        'mime_type',
                        lambda v, row: 'application/xml'
                        if row.id == 'fmt/979' else v,
                        pass_row=True)

    if os.path.isfile(txt_target_path):
        os.remove(txt_target_path)

    data = etl.dicts(table)
    count = 0
    for row in data:
        count += 1
        count_str = ('(' + str(count) + '/' + str(file_count) + '): ')
        source_file_path = row['source_file_path']
        if '/' not in source_file_path:
            source_file_path = os.path.join(base_source_dir, source_file_path)

        mime_type = row['mime_type']
        # TODO: Does not work when Tika is used -> find out why
        if ';' in mime_type:
            mime_type = mime_type.split(';')[0]

        version = row['version']
        result = None
        old_result = row['result']

        if not mime_type:
            if os.path.islink(source_file_path):
                mime_type = 'n/a'

            # kind = filetype.guess(source_file_path)
            extension = os.path.splitext(source_file_path)[1][1:].lower()
            if extension == 'xml':
                mime_type = 'application/xml'

        if not zip:
            print_path = os.path.relpath(source_file_path,
                                         Path(base_source_dir).parents[1])
            print(count_str + '.../' + print_path + ' (' + mime_type + ')')

        if mime_type not in mime_to_norm.keys():
            # print("|" + mime_type + "|")

            errors = True
            converted_now = True
            result = 'Conversion not supported'
            append_txt_file(
                txt_target_path,
                result + ': ' + source_file_path + ' (' + mime_type + ')')
            row['norm_file_path'] = ''
            row['original_file_copy'] = ''
        else:
            keep_original = mime_to_norm[mime_type][0]

            if keep_original:
                originals = True

            if zip:
                keep_original = False

            function = mime_to_norm[mime_type][1]

            # Ensure unique file names in dir hierarchy:
            norm_ext = mime_to_norm[mime_type][2]
            if not norm_ext:
                norm_ext = 'none'

            if make_unique:
                norm_ext = (base64.b32encode(
                    bytes(
                        str(count), encoding='ascii'))).decode('utf8').replace(
                            '=', '').lower() + '.' + norm_ext
            target_dir = os.path.dirname(
                source_file_path.replace(base_source_dir, base_target_dir))
            normalized = file_convert(source_file_path,
                                      mime_type,
                                      function,
                                      target_dir,
                                      tmp_dir,
                                      None,
                                      norm_ext,
                                      version,
                                      ocr,
                                      keep_original,
                                      zip=zip)

            if normalized['result'] == 0:
                errors = True
                result = 'Conversion failed'
                append_txt_file(
                    txt_target_path,
                    result + ': ' + source_file_path + ' (' + mime_type + ')')
            elif normalized['result'] == 1:
                result = 'Converted successfully'
                converted_now = True
            elif normalized['result'] == 2:
                errors = True
                result = 'Conversion not supported'
                append_txt_file(
                    txt_target_path,
                    result + ': ' + source_file_path + ' (' + mime_type + ')')
            elif normalized['result'] == 3:
                if old_result not in ('Converted successfully',
                                      'Manually converted'):
                    result = 'Manually converted'
                    converted_now = True
                else:
                    result = old_result
            elif normalized['result'] == 4:
                converted_now = True
                errors = True
                result = normalized['error']
                append_txt_file(
                    txt_target_path,
                    result + ': ' + source_file_path + ' (' + mime_type + ')')
            elif normalized['result'] == 5:
                result = 'Not a document'

            if normalized['norm_file_path']:
                row['norm_file_path'] = relpath(normalized['norm_file_path'],
                                                base_target_dir)

            file_copy_path = normalized['original_file_copy']
            if file_copy_path:
                file_copy_path = relpath(file_copy_path, base_target_dir)
            row['original_file_copy'] = file_copy_path

        row['result'] = result
        row_values = list(row.values())

        # TODO: Fixed by adding escapechar='\\' in append_tsv_row -> will that cause problems later?
        # row_values = [r.replace('\n', ' ') for r in row_values if r is not None]
        append_tsv_row(tsv_target_path, row_values)

        if sample and count > 9:
            break

    if not sample:
        shutil.move(tsv_target_path, tsv_source_path)
    # TODO: Add an option so that if merge = true, all files are copied to the top-level folder and empty subfolders are then deleted

    msg = None
    if sample:
        msg = 'Sample files converted.'
        if errors:
            msg = "Not all sample files were converted. See '" + txt_target_path + "' for details."
    else:
        if converted_now:
            msg = 'All files converted successfully.'
            if errors:
                msg = "Not all files were converted. See '" + txt_target_path + "' for details."
        else:
            msg = 'All files converted previously.'

    return msg, file_count, errors, originals  # TODO: Fix so that this is used instead for the final summary when several folders have been converted
Example #22
n_table = etl.fromdb(data_in, 'SELECT * FROM Names')
m_table = etl.fromdb(data_in, 'SELECT * FROM Movies')
r_table = etl.fromdb(data_in, 'SELECT * FROM Ratings')
tp_table = etl.fromdb(data_in, 'SELECT * FROM Title_principals')
print("Extract DONE")

# TRANSFORM
# movie personnel
d_movie_personnel = etl.cut(n_table, 'imdb_name_id', 'name', 'birth_name')

# title
d_title = etl.cut(m_table, 'imdb_title_id', 'title', 'original_title')

# genre
d_genre = etl.distinct(etl.cut(m_table, 'genre'))
rows = etl.nrows(d_genre)
generated = []
# print(rows)
for i in range(rows):
    uuid = out_cursor.execute('SELECT UUID();')
    uuid = out_cursor.fetchone()[0]
    generated.append(uuid)
d_genre = etl.addcolumn(d_genre, 'genre_id', generated)

# date
d_date = etl.distinct(etl.cut(m_table, 'year', 'date_published'))
rows = etl.nrows(d_date)
generated = []
for i in range(rows):
    uuid = out_cursor.execute('SELECT UUID();')
    uuid = out_cursor.fetchone()[0]
Example #23
CLEAN_UP = 1
#print(etl.head(src_table, 1))


def debug(table, columns):
    if DEBUG:
        print(etl.cut(table, columns))


def clean_up(table, column):
    if CLEAN_UP:
        return etl.cutout(table, column)


table = src_table
print('TOTAL SOURCE ROWS = ' + str(etl.nrows(table)))
print('SOURCE HEADERS = ' + str(etl.header(table)))

#UNUSED COLUMNS
if CLEAN_UP:
    table = clean_up(table, 'rcv_nm')
    table = clean_up(table, 'recp_cd')
    table = clean_up(table, 'ins_ind')
    table = clean_up(table, 'geo_ind')
    table = clean_up(table, 'cid')
    table = clean_up(table, 'occ_typ')
    print('TRIMMED HEADERS = ' + str(etl.header(table)))

table = etl.select(table, 'occ_dt', lambda x: x > datetime(2000, 1, 1))
print('ROWS POST YR 2000 = ' + str(etl.nrows(table)))
Example #24
File: test_io.py Project: brutimus/petl
def test_fromxml_url():

    tbl = fromxml('http://feeds.bbci.co.uk/news/rss.xml', './/item', 'title')
    print tbl
    assert nrows(tbl) > 0
Example #25
def main(argv):

    parser = argparse.ArgumentParser()
    parser.add_argument("--input-csv-filename", required=True, help="Input UTF8 CSV to summarize")
    parser.add_argument("--sep-columns", required=False, nargs = '*', default=argparse.SUPPRESS,
        help="Column names of columns containing comma- or semi-colon-separated values")
    parser.add_argument("--sep-character", required=False, help="Character used to separate values in multi-value " \
        "fields.  Defaults to ';' if not specified.")
    parser.add_argument("--skip-columns", required=False, nargs='*', default=argparse.SUPPRESS,
        help="Column names to NOT generate stats for")
    parser.add_argument("--skip-num-rows", required=False, type=int, help="Skip specified number "
        "of header rows")
    parser.add_argument("--first-ccb-column", required=False, help="String name of first CCB column.  If "
        "specified, all preceding columns will be labeled 'Servant Keeper' and this column "
        "and all subsequent will be labeled 'CCB'")
    args = parser.parse_args()

    if args.first_ccb_column is not None:
        column_prefix = 'Servant Keeper '
    else:
        column_prefix = ''

    assert os.path.isfile(args.input_csv_filename), "Error: cannot open file '" + args.input_csv_filename + "'"

    table = petl.fromcsv(args.input_csv_filename)

    # Skip header rows
    if args.skip_num_rows:
        skip_num = args.skip_num_rows
        assert skip_num > 0, "--skip-num-rows value '" + str(skip_num) + "' is invalid.  Must be positive."
        it = iter(table)
        while skip_num >= 0:
            row = next(it)
            skip_num -= 1
        table = petl.setheader(table, row)
        table = petl.tail(table, petl.nrows(table) - args.skip_num_rows)

    # Print nicely formatted stats for each column
    sep = ''
    args_dict = vars(args)
    skip_columns_specified = 'skip_columns' in args_dict
    sep_char_specified = 'sep_character' in args_dict
    for column in petl.header(table):
        if args.first_ccb_column is not None and column == args.first_ccb_column:
            column_prefix = 'CCB '
        if not skip_columns_specified or column not in args.skip_columns:
            output_str = column_prefix + "Column '" + column + "'"
            print sep + output_str
            print >> sys.stderr, output_str
            if args.sep_columns is not None and column in args.sep_columns:
                if sep_char_specified:
                    sep_character = args.sep_character
                else:
                    sep_character = ';'
                output_str = num_dict2str(dict_dump(sep_valuecounter(table, column, sep_character)))
                print output_str
            else:
                output_str = num_dict2str(dict_dump(valuecounts(table, column)))
                print output_str
        sep = '\n'

    # Flush to ensure all output is written
    sys.stdout.flush()
    sys.stderr.flush()
Example #26
    def etl(self, tids=None, models=None):
        if models is None:
            models = self.__models__

        self._sensor_id = self._add_sensor()

        self._observed_properties = {}
        # add the observed properties
        for m in models:
            payload = m.observed_property_payload
            payload['name'] = m.name
            self._observed_properties[m.name] = self._post_unique_item(
                'ObservedProperties', m.observed_property_payload)

        added = False
        for thing in tids:
            thing_id = thing['@iot.id']
            # point_id = thing['@nmbgmr.point_id']
            for m in models:
                print(f'Add {m.name}')
                ds_id = self._get_datastream(thing_id,
                                             m.datastream_payload['name'])

                skip_nobs = 0
                if ds_id:
                    print(f'Got datastream  thing={thing_id}, ds={ds_id}')
                    # check the number of obs for this datastream matches nrows
                    skip_nobs = self._get_nobservations(ds_id)
                    if skip_nobs:
                        print(f'Skipping nobs={skip_nobs}')

                wt = self._extract(thing, m, skip_nobs)
                if isinstance(wt, list):
                    nrows = len(wt)
                else:
                    nrows = petl.nrows(wt)

                if nrows:
                    added = True
                    print(f'Adding nobs={nrows}')
                    if not ds_id:
                        print('Adding datastream')

                        sensor_id = self._sensor_id
                        if self._sensor_id is None:
                            # extract the sensor from the first record
                            sensor_id = self._add_sensor(wt)

                        ds_id = self._add_datastream(
                            thing_id, self._observed_properties[m.name],
                            sensor_id, m.datastream_payload)

                        # r = self._make_resource(m)
                        # self.ckan_importer.add_resource(r)

                    # add observations to datastream
                    self._add_observations(ds_id, wt, m)
                else:
                    print('no obs to add')

        return added
Example #27
def test_fromxml_url():

    tbl = fromxml('http://feeds.bbci.co.uk/news/rss.xml', './/item', 'title')
    print tbl
    assert nrows(tbl) > 0
Example #28
File: views.py Project: sjyk/activeclean
def typeInference(table):
	for h in etl.header(table):
		col =  etl.cut(table, h)
		print etl.nrows(col)
Example #29
    street_name, street_suffix, street_postdir, street_full from {dor_parcel_table}
    '''.format(dor_parcel_table='dor_parcel')
engine_dor_parcel_rows = etl.fromdb(pg_db, dor_parcel_read_stmt)
if DEV:
    print(etl.look(engine_dor_parcel_rows))

# Get duplicate parcel_ids:
non_unique_parcel_id_rows = engine_dor_parcel_rows.duplicates(key='parcel_id')
unique_parcel_id_rows = etl.complement(engine_dor_parcel_rows,
                                       non_unique_parcel_id_rows)

# Get address comps for condos by joining to dor_parcel with unique parcel_id on parcel_id:
print("Relating condos to parcels...")
joined = etl.join(source_dor_condo_rows, unique_parcel_id_rows, key='parcel_id') \
    .convert('street_address', lambda a, row: row.street_address + ' # ' + row.unit_num, pass_row=True)
print("joined rowcount: ", etl.nrows(joined))
if DEV:
    print(etl.look(joined))

# Calculate errors
print("Calculating errors...")
unjoined = etl.antijoin(source_dor_condo_rows, joined, key='source_object_id')
print("unjoined rowcount: ", etl.nrows(unjoined))
dor_condos_unjoined_unmatched = etl.antijoin(unjoined,
                                             non_unique_parcel_id_rows,
                                             key='parcel_id').addfield(
                                                 'reason',
                                                 'non-active/remainder mapreg')
print("non-active/remainder mapreg error rowcount: ",
      etl.nrows(dor_condos_unjoined_unmatched))
if DEV:
Example #30
    select parcel_id, street_address, address_low, address_low_suffix, address_low_frac, address_high, street_predir, 
    street_name, street_suffix, street_postdir, street_full from {dor_parcel_table}
    '''.format(dor_parcel_table='dor_parcel')
engine_dor_parcel_rows = etl.fromdb(pg_db, dor_parcel_read_stmt)
if DEV:
    print(etl.look(engine_dor_parcel_rows))

# Get duplicate parcel_ids:
non_unique_parcel_id_rows = engine_dor_parcel_rows.duplicates(key='parcel_id')
unique_parcel_id_rows = etl.complement(engine_dor_parcel_rows, non_unique_parcel_id_rows)

# Get address comps for condos by joining to dor_parcel with unique parcel_id on parcel_id:
print("Relating condos to parcels...")
joined = etl.join(source_dor_condo_rows, unique_parcel_id_rows, key='parcel_id') \
    .convert('street_address', lambda a, row: row.street_address + ' # ' + row.unit_num, pass_row=True)
print("joined rowcount: ", etl.nrows(joined))
if DEV:
    print(etl.look(joined))

# Calculate errors
print("Calculating errors...")
unjoined = etl.antijoin(source_dor_condo_rows, joined, key='source_object_id')
print("unjoined rowcount: ", etl.nrows(unjoined))
dor_condos_unjoined_unmatched = etl.antijoin(unjoined, non_unique_parcel_id_rows, key='parcel_id').addfield('reason', 'non-active/remainder mapreg')
print("non-active/remainder mapreg error rowcount: ", etl.nrows(dor_condos_unjoined_unmatched))
if DEV:
    print(etl.look(dor_condos_unjoined_unmatched))
dor_condos_unjoined_duplicates = etl.antijoin(unjoined, dor_condos_unjoined_unmatched, key='source_object_id').addfield('reason', 'non-unique active/remainder mapreg')
print("non-unique active/remainder mapreg error rowcount: ", etl.nrows(dor_condos_unjoined_duplicates))
if DEV:
    print(etl.look(dor_condos_unjoined_duplicates))
Example #31
from __future__ import division, print_function, absolute_import


# nrows()
#########

import petl as etl
table = [['foo', 'bar'], ['a', 1], ['b', 2]]
etl.nrows(table)
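# etl.nrows counts data rows only (the header row is excluded), so the call
# above evaluates to 2.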


# valuecount()
##############

import petl as etl
table = [['foo', 'bar'],
         ['a', 1],
         ['b', 2],
         ['b', 7]]
etl.valuecount(table, 'foo', 'b')
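# etl.valuecount returns an (absolute count, relative frequency) pair; 'b'
# occurs in 2 of the 3 data rows, so this evaluates to (2, 0.666...).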


# valuecounter()
################

import petl as etl
table = [['foo', 'bar'],
         ['a', True],
         ['b'],
         ['b', True],
         ['c', False]]
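The snippet is cut off before the call itself; with petl's documented API the continuation would be along these lines:

etl.valuecounter(table, 'foo')
# returns a collections.Counter, e.g. Counter({'b': 2, 'a': 1, 'c': 1})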
Example #32
"""
DB-related tests, separated from main unit tests because they need local database
setup prior to running.

"""

import sys
sys.path.insert(0, './src')
from petl import cache, nrows
import logging
logging.basicConfig(level=logging.DEBUG)

t = (('foo', 'bar'),
     ('C', 2),
     ('A', 9),
     ('B', 6),
     ('E', 1),
     ('D', 10))
u = cache(t)
nrows(u)
nrows(u)