Example #1
def test_values():
    """Test the values function."""

    table = (('foo', 'bar', 'baz'), ('a', 1, True), ('b', 2), ('b', 7, False))

    actual = values(table, 'foo')
    expect = ('a', 'b', 'b')
    ieq(expect, actual)
    ieq(expect, actual)

    actual = values(table, 'bar')
    expect = (1, 2, 7)
    ieq(expect, actual)
    ieq(expect, actual)

    # old style signature for multiple fields, still supported
    actual = values(table, ('foo', 'bar'))
    expect = (('a', 1), ('b', 2), ('b', 7))
    ieq(expect, actual)
    ieq(expect, actual)

    # as of 0.24 new style signature for multiple fields
    actual = values(table, 'foo', 'bar')
    expect = (('a', 1), ('b', 2), ('b', 7))
    ieq(expect, actual)
    ieq(expect, actual)

    actual = values(table, 'baz')
    expect = (True, None, False)
    ieq(expect, actual)
    ieq(expect, actual)
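
The test above calls ieq twice on the same view to confirm that the container returned by values() can be iterated more than once. Below is a minimal sketch of an ieq-style assertion helper for readers following along; the real helper used by these tests lives in petl's test suite, so this is an illustration rather than petl's actual implementation.

def ieq(expect, actual):
    # Materialise both iterables and compare element-wise; assumes both are
    # finite, which holds for the small literal tables used in these tests.
    assert list(expect) == list(actual)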
Example #2
def test_values():
    """Test the values function."""
    
    table = (('foo', 'bar', 'baz'), 
             ('a', 1, True), 
             ('b', 2), 
             ('b', 7, False))

    actual = values(table, 'foo')
    expect = ('a', 'b', 'b')
    ieq(expect, actual) 
    ieq(expect, actual) 

    actual = values(table, 'bar')
    expect = (1, 2, 7)
    ieq(expect, actual) 
    ieq(expect, actual) 
    
    actual = values(table, ('foo', 'bar'))
    expect = (('a', 1), ('b', 2), ('b', 7))
    ieq(expect, actual) 
    ieq(expect, actual) 
    
    actual = values(table, 'baz')
    expect = (True, None, False)
    ieq(expect, actual)
    ieq(expect, actual) 
Example #3
File: test_util.py  Project: talwai/petl
def test_values():
    """Test the values function."""

    table = (("foo", "bar", "baz"), ("a", 1, True), ("b", 2), ("b", 7, False))

    actual = values(table, "foo")
    expect = ("a", "b", "b")
    ieq(expect, actual)
    ieq(expect, actual)

    actual = values(table, "bar")
    expect = (1, 2, 7)
    ieq(expect, actual)
    ieq(expect, actual)

    # old style signature for multiple fields, still supported
    actual = values(table, ("foo", "bar"))
    expect = (("a", 1), ("b", 2), ("b", 7))
    ieq(expect, actual)
    ieq(expect, actual)

    # as of 0.24 new style signature for multiple fields
    actual = values(table, "foo", "bar")
    expect = (("a", 1), ("b", 2), ("b", 7))
    ieq(expect, actual)
    ieq(expect, actual)

    actual = values(table, "baz")
    expect = (True, None, False)
    ieq(expect, actual)
    ieq(expect, actual)
Example #4
def test_values():
    """Test the values function."""
    
    table = (('foo', 'bar', 'baz'), 
             ('a', 1, True), 
             ('b', 2), 
             ('b', 7, False))

    actual = values(table, 'foo')
    expect = ('a', 'b', 'b')
    ieq(expect, actual) 
    ieq(expect, actual) 

    actual = values(table, 'bar')
    expect = (1, 2, 7)
    ieq(expect, actual) 
    ieq(expect, actual) 
    
    actual = values(table, ('foo', 'bar'))
    expect = (('a', 1), ('b', 2), ('b', 7))
    ieq(expect, actual) 
    ieq(expect, actual) 
    
    actual = values(table, 'baz')
    expect = (True, None, False)
    ieq(expect, actual)
    ieq(expect, actual) 
Example #5
def test_values():
    """Test the values function."""
    
    table = (('foo', 'bar', 'baz'), 
             ('a', 1, True), 
             ('b', 2), 
             ('b', 7, False))

    actual = values(table, 'foo')
    expect = ('a', 'b', 'b')
    ieq(expect, actual) 
    ieq(expect, actual) 

    actual = values(table, 'bar')
    expect = (1, 2, 7)
    ieq(expect, actual) 
    ieq(expect, actual) 

    # old style signature for multiple fields, still supported
    actual = values(table, ('foo', 'bar'))
    expect = (('a', 1), ('b', 2), ('b', 7))
    ieq(expect, actual) 
    ieq(expect, actual) 

    # as of 0.24 new style signature for multiple fields
    actual = values(table, 'foo', 'bar')
    expect = (('a', 1), ('b', 2), ('b', 7))
    ieq(expect, actual)
    ieq(expect, actual)

    actual = values(table, 'baz')
    expect = (True, None, False)
    ieq(expect, actual)
    ieq(expect, actual) 
Example #6
File: get_delta.py  Project: hessbag/ketl
def get_delta(source_table, target_table, key='id'):
    source_table_headers = etl.header(source_table)
    target_table_headers = etl.header(target_table)

    if source_table_headers != target_table_headers:
        raise Exception(
            'Source table columns do not match target table columns')

    source_ids = etl.cut(source_table, key)
    target_ids = etl.cut(target_table, key)
    added_ids_table, _ = etl.diff(source_ids, target_ids)

    merged_table = etl.merge(source_table, target_table, key=key)

    load_frame = etl.todataframe(
        etl.selectin(target_table, key, etl.values(added_ids_table, key)))
    print(load_frame)

    for row in etl.data(merged_table):
        for i, col in enumerate(row):
            if isinstance(col, etl.transform.reductions.Conflict):
                changes = tuple(col)
                print('For car {}, {} changed from {} to {}'.format(
                    row[0], source_table_headers[i], changes[1], changes[0]))
                row_dict = dict(zip(source_table_headers, list(row)))
                row_dict[source_table_headers[i]] = changes[0]
                row_dict = {key: [val] for (key, val) in row_dict.items()}
                print(row_dict)
                df = pd.DataFrame(row_dict)
                # DataFrame.append was removed in pandas 2.x; use concat instead
                load_frame = pd.concat([load_frame, df], ignore_index=True)
                break

    return etl.fromdataframe(load_frame)
Example #7
File: wl.py  Project: NMBGMR/WDIETL
 def _has_observations(self, record):
     sql = '''select count(PointID) from dbo.WaterLevelsContinuous_Pressure
     where PointID=%s'''
     pid = record['PointID']
     table = petl.fromdb(nm_aquifier_connection(), sql, (pid, ))
     nobs = petl.values(table, '')[0]
     print(f'{pid} has nobs={nobs}')
     return bool(nobs)
Example #8
 def get_max_dt(self):
     """
     Gets the current maximum date in the table
     :return:
     """
     sql = 'select max(dt) as max_dt from ol_transactions'
     self.log("SQL: {0}".format(sql))
     table = petl.fromdb(self.connection, sql)
     max_dt = petl.values(table, 'max_dt')[0]
     return max_dt
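
The pattern above, fromdb followed by values and an index into the first result, can be exercised against an in-memory SQLite database. The sketch below is only an illustration: the connection setup and sample dates are invented, while the ol_transactions table and the max_dt alias come from the snippet above.

import sqlite3
import petl

conn = sqlite3.connect(':memory:')
conn.execute('create table ol_transactions (dt text)')
conn.execute("insert into ol_transactions values ('2020-01-01'), ('2020-02-01')")
tbl = petl.fromdb(conn, 'select max(dt) as max_dt from ol_transactions')
print(petl.values(tbl, 'max_dt')[0])  # expected: '2020-02-01'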
Example #9
def get_calls_per_office(parsons_table):
    target_list = []
    for targets in petl.values(parsons_table.table, "target_names"):
        for target in targets:
            target_list.append(target)

    counter = collections.Counter(target_list)
    calls_counter = dict(counter)
    calls_per_office = [{"name" : key, "num_calls": value} for key, value in calls_counter.items()]
    return Table(calls_per_office)
Example #10
def test_minimal_start():
    data = [
        ['dur', 'description', 'start'],
        [timedelta(), 'test 1',
         datetime(2000, 1, 1, 0, 15)],
        [timedelta(), 'test 1',
         datetime(2000, 1, 1, 20, 15)],
    ]

    result = group_entries_by_day(data)
    assert set(petl.values(result, 'start')) == {datetime(2000, 1, 1, 0, 15)}
Example #11
def sep_valuecounter(table, col_name, sep_char=';'):
    dict_sep = {}
    for value in petl.values(table, col_name):
        if value.strip() == '':
            continue
        else:
            for sep in value.split(sep_char):
                sep_str = sep.strip()
                if sep_str not in dict_sep:
                    dict_sep[sep_str] = 0
                dict_sep[sep_str] += 1
    return dict_sep
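
A hypothetical usage of sep_valuecounter on a small in-memory table (assuming the function above is in scope); the column layout and values are invented purely for illustration.

example = [('name', 'tags'),
           ('a', 'x; y'),
           ('b', 'y;z'),
           ('c', '')]
print(sep_valuecounter(example, 'tags'))  # expected: {'x': 1, 'y': 2, 'z': 1}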
Example #12
def sep_valuecounter(table, col_name, sep_char=';'):
    dict_sep = {}
    for value in petl.values(table, col_name):
        if value.strip() == '':
            continue
        else:
            for sep in value.split(sep_char):
                sep_str = sep.strip()
                if sep_str not in dict_sep:
                    dict_sep[sep_str] = 0
                dict_sep[sep_str] += 1
    return dict_sep
Example #13
    def get_min_dt(self, last):
        """
        Gets the minimum date considering previous extractions from the table.
        :param last:
        :return:
        """
        if last is None or len(last) == 0:
            sql = "select min(dt) as min_dt from ol_transactions"
        else:
            sql = "select min(dt) as min_dt from ol_transactions where dt >= '{0}'".format(last)

        self.log("SQL: {0}".format(sql))
        table = petl.fromdb(self.connection, sql)
        extract_dt = petl.values(table, 'min_dt')[0]
        return extract_dt
Example #14
def test_sum_duration():
    data = [
        ['dur', 'description', 'start'],
        [timedelta(minutes=1), 'test 1',
         datetime(2000, 1, 1, 15, 15)],
        [timedelta(minutes=1), 'test 1',
         datetime(2000, 1, 1, 20, 15)],
        [timedelta(hours=2), 'test 1',
         datetime(2000, 1, 20, 15, 15)],
        [timedelta(hours=1), 'test 1',
         datetime(2000, 1, 20, 20, 15)],
    ]

    result = group_entries_by_day(data)
    assert set(petl.values(
        result, 'dur')) == {timedelta(minutes=2),
                            timedelta(hours=3)}
Example #15
def valuecounts(table, col_name):
    return_dict = {}
    reported_count = 0
    unreported_count = 0
    column = petl.values(table, col_name)
    nrows = petl.nrows(table)
    non_blanks = petl.select(table, '{' + quote_single_quote(col_name) + "} != ''")
    num_blanks = nrows - petl.nrows(non_blanks)
    counts_table = petl.valuecounts(non_blanks, col_name)
    for row in petl.records(counts_table):
        if row['frequency'] > 0.01:
            return_dict[row[col_name]] = row['count']
            reported_count += row['count']
        else:
            unreported_count += row['count']
    return_dict['<other>'] = unreported_count
    return_dict['<blank>'] = num_blanks
    return return_dict
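
The row['count'] and row['frequency'] lookups above depend on the shape of the table returned by petl.valuecounts, which carries the value field followed by count and frequency columns. A small sketch of that shape, independent of the quote_single_quote helper the function assumes:

import petl

tbl = [('col',), ('a',), ('a',), ('b',)]
for row in petl.records(petl.valuecounts(tbl, 'col')):
    # each row exposes the value, how many times it occurs, and its relative frequency
    print(row['col'], row['count'], row['frequency'])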
Example #16
def load_split_transaction_details(list_split_details_files):
    dict_split_transaction_details = {}
    if list_split_details_files is not None:
        for split_details_file in list_split_details_files:
            assert os.path.isfile(split_details_file), "Error: cannot open file '" + split_details_file + "'"
            table = petl.fromcsv(split_details_file)
            account_name = get_account_name(petl.values(table, 'Account'))
            for row in petl.records(table):
                if row['Account'] == 'Split Transaction':
                    string_key = row['Individual ID'] + ',' + row['Batch Date']
                    if string_key not in dict_split_transaction_details:
                        dict_split_transaction_details[string_key] = []
                    dict_split_transaction_details[string_key].append({
                        'Account': account_name,
                        'Amount': row['Amount'],
                        'Tax': row['Tax']
                        })
    return dict_split_transaction_details
Example #17
    def get_column_max_width(self, column):
        """
        Return the maximum width of the column.

        `Args:`
            column: str
                The column name.
        `Returns:`
            int
        """

        max_width = 0

        for v in petl.values(self.table, column):

            if len(str(v)) > max_width:
                max_width = len(str(v))

        return max_width
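
The same idea works without the wrapping class; a minimal sketch over a plain petl table, with a made-up single-column table for illustration:

import petl

tbl = [('name',), ('ann',), ('beatrice',)]
max_width = max((len(str(v)) for v in petl.values(tbl, 'name')), default=0)
print(max_width)  # expected: 8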
Example #18
    def phone(self, table):
        """
        Match based on a list of phone numbers. The table
        can contain up to 500 phone numbers to match.

        `Args:`
            table: parsons table
                See :ref:`parsons-table`. One row per phone number,
                up to 500 phone numbers.
        `Returns:`
            See :ref:`parsons-table` for output options.
        """

        url = self.connection.uri + 'person/phone-search'

        args = {'phones': list(petl.values(table.table, 0))}

        return Table(
            self.connection.request(url, args=args, raw=True)['result'])
Example #19
def valuecounts(table, col_name):
    return_dict = {}
    reported_count = 0
    unreported_count = 0
    column = petl.values(table, col_name)
    nrows = petl.nrows(table)
    non_blanks = petl.select(table,
                             '{' + quote_single_quote(col_name) + "} != ''")
    num_blanks = nrows - petl.nrows(non_blanks)
    counts_table = petl.valuecounts(non_blanks, col_name)
    for row in petl.records(counts_table):
        if row['frequency'] > 0.01:
            return_dict[row[col_name]] = row['count']
            reported_count += row['count']
        else:
            unreported_count += row['count']
    return_dict['<other>'] = unreported_count
    return_dict['<blank>'] = num_blanks
    return return_dict
Example #20
    def process_records(self):
        """
        Handles querying and extraction
        :return:
        """
        rows = petl.values(self.table, 'dt', 'total', 'duration')
        row_count = 0
        measurements = []
        properties = {'app_id': self.app_id}
        source = "littledog.com"
        for row in rows:
            timestamp = int(row[0].strftime('%s'))
            total = int(row[1])
            duration = int(row[2])
            logging.debug("Add Measurements => dt: {0}, total: {1}, duration: {2} ".format(timestamp, total, duration))
            row_count += 1
            measurements.append(Measurement(metric='ONLINE_TRANSACTION_COUNT',
                                            source=source,
                                            value=total,
                                            timestamp=timestamp,
                                            properties=properties))
            measurements.append(Measurement(metric='ONLINE_TRANSACTION_TIME',
                                            source=source,
                                            value=duration,
                                            timestamp=timestamp,
                                            properties=properties))

            # Send when we have batch of 10 measurements
            if row_count == 10:
                # send measurements
                self.send_measurements(measurements)
                measurements = []
                row_count = 0

        # If there are any remaining measurements send them on
        if len(measurements) > 0:
            self.api.measurement_create_batch(measurements)
Example #21
def order_by_constraint(base_path, table, schema, self_dep_set):
    file_name = base_path + "/content/data/" + table + ".tsv"
    tempfile = NamedTemporaryFile(mode='w',
                                  dir=base_path + "/content/data/",
                                  delete=False)
    table = etl.fromcsv(file_name,
                        delimiter='\t',
                        skipinitialspace=True,
                        quoting=csv.QUOTE_NONE,
                        quotechar='',
                        escapechar='')

    key_dep_dict = {}

    # print(file_name)
    for constraint in self_dep_set:
        child_dep, parent_dep = constraint.split(':')
        data = etl.values(table, child_dep, parent_dep)
        for d in data:
            key_dep_set = {d[1]}
            key_dep_dict.update({d[0]: key_dep_set})

    key_dep_list = toposort_flatten(key_dep_dict)
    table = etl.addfield(table, 'pwb_index',
                         lambda rec: int(key_dep_list.index(rec[child_dep])))
    table = etl.sort(table, 'pwb_index')
    table = etl.cutout(table, 'pwb_index')

    writer = csv.writer(tempfile,
                        delimiter='\t',
                        quoting=csv.QUOTE_NONE,
                        quotechar='',
                        lineterminator='\n',
                        escapechar='')

    writer.writerows(table)
    shutil.move(tempfile.name, file_name)
Example #22
def split_dataset(dataset, p_train_data, split_mode):

    fields = list(fieldnames(dataset))
    
    size_dataset = len(values(dataset, fields[0])) 
    size_train_data = int(round(size_dataset * p_train_data))
    size_test_data = abs(size_train_data - size_dataset)


    if split_mode == 'normal' :

        train_data = head(dataset, size_train_data - 1)
        
        if size_test_data == 0:
            
            test_data = []
            
        else:
            
            test_data = tail(dataset, size_test_data - 1)

    #################### TODO: Shuffle mode still to be added ###############

    return train_data, test_data
Example #23
                    tempfile = NamedTemporaryFile(mode='w',
                                                  dir=base_path +
                                                  "/content/data/",
                                                  delete=False)
                    table = etl.fromcsv(file_name,
                                        delimiter='\t',
                                        skipinitialspace=True,
                                        quoting=csv.QUOTE_NONE,
                                        quotechar='',
                                        escapechar='')
                    key_dep_dict = {}

                    print(file_name)
                    for constraint in value:
                        child_dep, parent_dep = constraint.split(':')
                        data = etl.values(table, child_dep, parent_dep)
                        for d in data:
                            key_dep_set = {d[1]}
                            key_dep_dict.update({d[0]: key_dep_set})

                    key_dep_list = toposort_flatten(key_dep_dict)
                    table = etl.addfield(
                        table, 'pwb_index',
                        lambda rec: int(key_dep_list.index(rec[child_dep])))
                    table = etl.sort(table, 'pwb_index')
                    table = etl.cutout(table, 'pwb_index')

                    writer = csv.writer(tempfile,
                                        delimiter='\t',
                                        quoting=csv.QUOTE_NONE,
                                        quotechar='',
Example #24
try:
    # if the user provided a database password
    if args.database_pass:
        conn = psycopg2.connect(
            "dbname={} user={} password={} host='localhost'".format(
                args.database_name, args.database_user, args.database_pass))
    # otherwise, connect without a password
    else:
        conn = psycopg2.connect("dbname={} user={} host='localhost'".format(
            args.database_name, args.database_user))
except:
    print("Unable to connect to the database")
    exit()

# get these fields from the CSV
for record in etl.values(records, args.from_field_name, args.to_field_name):
    with conn:
        with conn.cursor() as curs:
            # resource_type_id 2 is metadata for items
            sql = "SELECT text_value FROM metadatavalue WHERE resource_type_id=2 AND metadata_field_id=%s AND text_value=%s"
            curs.execute(sql, (args.metadata_field_id, record[0]))
            records_to_fix = curs.rowcount

    print("Fixing {} occurences of: {}".format(records_to_fix, record[0]))

    with conn:
        with conn.cursor() as curs:
            sql = "UPDATE metadatavalue SET text_value=%s WHERE resource_type_id=2 AND metadata_field_id=%s AND text_value=%s"
            curs.execute(sql, (record[1], args.metadata_field_id, record[0]))
            rows_updated = curs.rowcount
            print("> {} occurences updated".format(rows_updated))
Example #25
def extract_values(dataset, field_position):

    fields = list(fieldnames(dataset))
    field_values = values(dataset, field_position)
    
    return field_values
Example #26
    def _get_field_sample(resource_sample, field):
        """Return a subset of the relevant data column."""

        sample_table = petl.fromdicts(resource_sample)
        sample_column = list(petl.values(sample_table, field['name']))
        return sample_column
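
A hypothetical illustration of the same pattern, building a table from a list of dicts with petl.fromdicts and pulling one column out of it; the sample records and field name are invented.

import petl

resource_sample = [{'name': 'a', 'age': 1}, {'name': 'b', 'age': 2}]
sample_column = list(petl.values(petl.fromdicts(resource_sample), 'age'))
print(sample_column)  # expected: [1, 2]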
Example #27
File: pet.py  Project: pjakobsen/shifty
def compact(s):
    shift = s['role']['name'][0:14] + " " + s['shift']['start'][11:16].lstrip("0") + " " + s['user']['firstname'] + " " + s['user']['lastname'][0:1]
    #shift["Notes"]=s['shift']['notes']
    return shift


data = [mini(s) for s in db]
pprint(data)
table = etl.fromdicts(data)

pprint(etl.look(table))


# Using set, get a list of distinct departments

departments = set(etl.values(table, "Department"))
pprint(departments)

for d in departments:
    print(d)
    table2 = etl.select(table, 'Department', lambda v: v == d ).facet("Day" )
    report = (
          etl
         .empty()
         .addcolumn('12-13', list(table2["2017-12-13"].values("Shift")))
         .addcolumn('12-14', list(table2["2017-12-14"].values("Shift")))
         .addcolumn('12-15', list(table2["2017-12-15"].values("Shift")))
         .addcolumn('12-16', list(table2["2017-12-16"].values("Shift")))
         .addcolumn('12-17', list(table2["2017-12-17"].values("Shift")))

         )
Example #28
    def _get_field_sample(resource_sample, field):
        """Return a subset of the relevant data column."""

        sample_table = petl.fromdicts(resource_sample)
        sample_column = list(petl.values(sample_table, field['name']))
        return sample_column
Example #29
# Select the data from the table on the basis of the current date.
# The variable num contains only each country's data for 2020-04-30, i.e. only the latest data is kept.
num = etl.select(data, 'date', lambda r: r == '2020-04-30')

# The sort function is used to sort the unsorted data on the basis of iso_code,
# which makes it easier to join the data in further steps.
table1_sort = etl.sort(num, key='iso_code')

# A counter variable is declared to count the number of countries.
count = 0

# The values function is used to read the data from the table.
for i in etl.values(table1_sort, 'iso_code', 'location', 'date', 'total_cases',
                    'new_cases', 'total_deaths', 'new_deaths',
                    'total_cases_per_million', 'new_cases_per_million',
                    'total_deaths_per_million', 'new_deaths_per_million',
                    'total_tests', 'new_tests', 'total_tests_per_thousand',
                    'new_tests_per_thousand', 'tests_units'):

    # Condition to take 15 countries from the table for integration;
    # 16 is used because the row seen when count is 0 contains unnecessary data.
    if count == 16:
        break

    # data1 was already declared with the header in the code above; the remaining data is appended to it.
    data1.append(i)
    count = count + 1

# removing the unnecessary data from the list which was in data1[1]
data1.pop(1)
Example #30
from __future__ import division, print_function, absolute_import, unicode_literals


# values()
##########

import petl as etl

table1 = [["foo", "bar"], ["a", True], ["b"], ["b", True], ["c", False]]
foo = etl.values(table1, "foo")
foo
list(foo)
bar = etl.values(table1, "bar")
bar
list(bar)
# values from multiple fields
table2 = [["foo", "bar", "baz"], [1, "a", True], [2, "bb", True], [3, "d", False]]
foobaz = etl.values(table2, "foo", "baz")
foobaz
list(foobaz)


# header()
##########


import petl as etl

table = [["foo", "bar"], ["a", 1], ["b", 2]]
etl.header(table)
Example #31
def transform(mmj_menu_items, mmj_categories, prices, organization_id,
              source_db, debug):
    """
    Transform data
    """
    # source data table
    source_dt = utils.view_to_list(mmj_menu_items)

    cut_menu_data = [
        'id', 'vendor_id', 'menu_id', 'dispensary_id', 'strain_id',
        'created_at', 'updated_at', 'category_id', 'name', 'sativa', 'indica',
        'on_hold', 'product_type', 'image_file_name', 'medicine_amount',
        'product_type'
    ]

    cut_prices = [
        'menu_item_id', 'dispensary_id', 'price_half_gram', 'price_gram',
        'price_two_gram', 'price_eigth', 'price_quarter', 'price_half',
        'price_ounce'
    ]

    # Cut out all the fields we don't need to load
    menu_items = etl.cut(source_dt, cut_menu_data)
    prices_data = etl.cut(prices, cut_prices)

    menu_items = (etl.addfield(
        menu_items, 'createdAtEpoch').addfield('unitOfMeasure').addfield(
            'locationProductDetails').addfield('keys').addfield('restockLevel')
                  )

    # Two-step transform and cut. First we need to cut the name
    # and id from the source data to map to.
    cut_source_cats = etl.cut(mmj_categories, 'name', 'id', 'measurement')
    source_values = etl.values(cut_source_cats, 'name', 'id')

    # Then we need a dict of categories to compare against.
    # id is stored to match against when transforming and mapping categories
    mmj_categories = dict([(value, id) for (value, id) in source_values])

    mappings = OrderedDict()
    mappings['id'] = 'id'
    mappings['createdAt'] = 'created_at'
    mappings['updatedAt'] = 'updated_at'
    mappings['createdAtEpoch'] = lambda x: utils.create_epoch(x.created_at)
    mappings['name'] = 'name'
    mappings['shareOnWM'] = lambda x: _wm_integration(x.id, source_db)
    """
    1 = Units
    2 = Grams (weight)
    """
    mappings['unitOfMeasure'] = \
        lambda x: _map_uom(x.category_id, source_db)

    fields = etl.fieldmap(menu_items, mappings)
    data = etl.merge(menu_items, fields, key='id')

    items = []
    for item in etl.dicts(data):

        breakpoint_pricing = (etl.select(
            prices_data,
            lambda x: x.dispensary_id == item['dispensary_id']).rename({
                'price_eigth':
                'price_eighth'
            }).cutout('menu_item_id'))
        # Set image url for load to download
        url = None
        if debug and item['image_file_name'] is not None:
            url = ("https://wm-mmjmenu-images-development.s3."
                   "amazonaws.com/menu_items/images/{0}/large/"
                   "{1}").format(item['id'], item['image_file_name'])
        elif item['image_file_name'] is not None:
            url = ("https://wm-mmjmenu-images-production.s3."
                   "amazonaws.com/menu_items/images/{0}/large/"
                   "{1}").format(item['id'], item['image_file_name'])

        item['image_file_name'] = url

        item['categoryId'] = _map_categories(item['category_id'],
                                             item['sativa'], item['indica'],
                                             mmj_categories, menu_items)
        item['keys'] = {
            'dispensary_id': item['dispensary_id'],
            'id': item['id'],
            'menu_id': item['menu_id'],
            'vendor_id': item['vendor_id'],
            'strain_id': item['strain_id'],
            'category_id': item['category_id']
        }

        # set a default netMJ value if the menu item is a unit product
        if item['unitOfMeasure'] == 2:
            item['netMarijuana'] = int(item['medicine_amount'])

        for key in item['keys'].keys():
            if not item['keys'][key]:
                del item['keys'][key]

        item['locationProductDetails'] = {
            'id': item['id'],
            'active': _active(item['on_hold'])
        }

        item['restockLevel'] = _restock_level(item['dispensary_id'],
                                              item['product_type'], source_db)

        if item['shareOnWM'] is None:
            item['shareOnWM'] = False

        for price in etl.dicts(breakpoint_pricing):
            try:
                price_two_gram = price['price_two_gram']
            except KeyError:
                price_two_gram = 0.0

            item['locationProductDetails']['weightPricing'] = {
                'price_half_gram':
                utils.dollars_to_cents(price['price_half_gram']),
                'price_two_gram': utils.dollars_to_cents(price_two_gram),
                'price_gram': utils.dollars_to_cents(price['price_gram']),
                'price_eighth': utils.dollars_to_cents(price['price_eighth']),
                'price_quarter':
                utils.dollars_to_cents(price['price_quarter']),
                'price_half': utils.dollars_to_cents(price['price_half']),
                'price_ounce': utils.dollars_to_cents(price['price_ounce'])
            }

        del item['vendor_id']
        del item['indica']
        del item['dispensary_id']
        del item['id']
        del item['strain_id']
        del item['on_hold']
        del item['menu_id']
        del item['sativa']
        del item['category_id']
        del item['updated_at']
        del item['created_at']
        del item['product_type']

        if item['image_file_name'] is None:
            del item['image_file_name']

        # set up final structure for API
        items.append(item)

    # Remove inactive items (don't mutate the list while iterating over it)
    items = [item for item in items
             if item['locationProductDetails']['active'] is not False]

    if debug:
        result = json.dumps(items,
                            sort_keys=True,
                            indent=4,
                            default=utils.json_serial)
        print(result)

    return items
Example #32
from __future__ import division, print_function, absolute_import, \
    unicode_literals

# values()
##########

import petl as etl
table1 = [['foo', 'bar'], ['a', True], ['b'], ['b', True], ['c', False]]
foo = etl.values(table1, 'foo')
foo
list(foo)
bar = etl.values(table1, 'bar')
bar
list(bar)
# values from multiple fields
table2 = [['foo', 'bar', 'baz'], [1, 'a', True], [2, 'bb', True],
          [3, 'd', False]]
foobaz = etl.values(table2, 'foo', 'baz')
foobaz
list(foobaz)

# header()
##########

import petl as etl
table = [['foo', 'bar'], ['a', 1], ['b', 2]]
etl.header(table)

# fieldnames()
##############
Example #33
def convert_folder(base_source_dir,
                   base_target_dir,
                   tmp_dir,
                   tika=False,
                   ocr=False,
                   merge=False,
                   tsv_source_path=None,
                   tsv_target_path=None,
                   make_unique=True,
                   sample=False,
                   zip=False):
    # WAIT: Add an option in the GUI to choose whether files should be OCR-processed
    txt_target_path = base_target_dir + '_result.txt'
    json_tmp_dir = base_target_dir + '_tmp'
    converted_now = False
    errors = False
    originals = False

    if merge is False:  # TODO: Are both arguments needed?
        make_unique = False

    if tsv_source_path is None:
        tsv_source_path = base_target_dir + '.tsv'
    else:
        txt_target_path = os.path.splitext(
            tsv_source_path)[1][1:] + '_result.txt'

    if tsv_target_path is None:
        tsv_target_path = base_target_dir + '_processed.tsv'

    if os.path.exists(tsv_target_path):
        os.remove(tsv_target_path)

    Path(base_target_dir).mkdir(parents=True, exist_ok=True)

    # TODO: Does the mime type show directly whether it is PDF/A, or must extra fields in the two below be checked? Pre-check for Tika and Siegfried?

    # TODO: Is this TSV check needed here? A check is done before this function is called, so it may be unnecessary?
    if not os.path.isfile(tsv_source_path):
        if tika:
            run_tika(tsv_source_path, base_source_dir, json_tmp_dir, zip)
        else:
            run_siegfried(base_source_dir, tmp_dir, tsv_source_path, zip)

    # TODO: Add a check that the TSV file is not empty
    replace_text_in_file(tsv_source_path, '\0', '')

    table = etl.fromtsv(tsv_source_path)
    table = etl.rename(table, {
        'filename': 'source_file_path',
        'tika_batch_fs_relative_path': 'source_file_path',
        'filesize': 'file_size',
        'mime': 'mime_type',
        'Content_Type': 'mime_type',
        'Version': 'version'
    },
                       strict=False)

    thumbs_table = etl.select(
        table, lambda rec: Path(rec.source_file_path).name == 'Thumbs.db')
    if etl.nrows(thumbs_table) > 0:
        thumbs_paths = etl.values(thumbs_table, 'source_file_path')
        for path in thumbs_paths:
            if '/' not in path:
                path = os.path.join(base_source_dir, path)
            if os.path.isfile(path):
                os.remove(path)

        table = etl.select(
            table, lambda rec: Path(rec.source_file_path).name != 'Thumbs.db')

    table = etl.select(table, lambda rec: rec.source_file_path != '')
    table = etl.select(table, lambda rec: '#' not in rec.source_file_path)
    # WAIT: The line above is not a complete check for embedded documents, since # can actually occur in file names
    row_count = etl.nrows(table)

    file_count = sum([len(files) for r, d, files in os.walk(base_source_dir)])

    if row_count == 0:
        print('No files to convert. Exiting.')
        return 'Error', file_count
    elif file_count != row_count:
        print('Row count: ' + str(row_count))
        print('File count: ' + str(file_count))
        print("Files listed in '" + tsv_source_path +
              "' doesn't match files on disk. Exiting.")
        return 'Error', file_count
    elif not zip:
        print('Converting files..')

    # WAIT: Add a check on file size before and after conversion

    append_fields = ('version', 'norm_file_path', 'result',
                     'original_file_copy', 'id')
    table = add_fields(append_fields, table)

    cut_fields = ('0', '1', 'X_TIKA_EXCEPTION_runtime',
                  'X_TIKA_EXCEPTION_warn')
    table = remove_fields(cut_fields, table)

    header = etl.header(table)
    append_tsv_row(tsv_target_path, header)

    # Treat csv (detected from extension only) as plain text:
    table = etl.convert(table,
                        'mime_type',
                        lambda v, row: 'text/plain'
                        if row.id == 'x-fmt/18' else v,
                        pass_row=True)

    # Update for missing mime types where id is known:
    table = etl.convert(table,
                        'mime_type',
                        lambda v, row: 'application/xml'
                        if row.id == 'fmt/979' else v,
                        pass_row=True)

    if os.path.isfile(txt_target_path):
        os.remove(txt_target_path)

    data = etl.dicts(table)
    count = 0
    for row in data:
        count += 1
        count_str = ('(' + str(count) + '/' + str(file_count) + '): ')
        source_file_path = row['source_file_path']
        if '/' not in source_file_path:
            source_file_path = os.path.join(base_source_dir, source_file_path)

        mime_type = row['mime_type']
        # TODO: Does not work when Tika is used -> find out why
        if ';' in mime_type:
            mime_type = mime_type.split(';')[0]

        version = row['version']
        result = None
        old_result = row['result']

        if not mime_type:
            if os.path.islink(source_file_path):
                mime_type = 'n/a'

            # kind = filetype.guess(source_file_path)
            extension = os.path.splitext(source_file_path)[1][1:].lower()
            if extension == 'xml':
                mime_type = 'application/xml'

        if not zip:
            print_path = os.path.relpath(source_file_path,
                                         Path(base_source_dir).parents[1])
            print(count_str + '.../' + print_path + ' (' + mime_type + ')')

        if mime_type not in mime_to_norm.keys():
            # print("|" + mime_type + "|")

            errors = True
            converted_now = True
            result = 'Conversion not supported'
            append_txt_file(
                txt_target_path,
                result + ': ' + source_file_path + ' (' + mime_type + ')')
            row['norm_file_path'] = ''
            row['original_file_copy'] = ''
        else:
            keep_original = mime_to_norm[mime_type][0]

            if keep_original:
                originals = True

            if zip:
                keep_original = False

            function = mime_to_norm[mime_type][1]

            # Ensure unique file names in dir hierarchy:
            norm_ext = mime_to_norm[mime_type][2]
            if not norm_ext:
                norm_ext = 'none'

            if make_unique:
                norm_ext = (base64.b32encode(
                    bytes(
                        str(count), encoding='ascii'))).decode('utf8').replace(
                            '=', '').lower() + '.' + norm_ext
            target_dir = os.path.dirname(
                source_file_path.replace(base_source_dir, base_target_dir))
            normalized = file_convert(source_file_path,
                                      mime_type,
                                      function,
                                      target_dir,
                                      tmp_dir,
                                      None,
                                      norm_ext,
                                      version,
                                      ocr,
                                      keep_original,
                                      zip=zip)

            if normalized['result'] == 0:
                errors = True
                result = 'Conversion failed'
                append_txt_file(
                    txt_target_path,
                    result + ': ' + source_file_path + ' (' + mime_type + ')')
            elif normalized['result'] == 1:
                result = 'Converted successfully'
                converted_now = True
            elif normalized['result'] == 2:
                errors = True
                result = 'Conversion not supported'
                append_txt_file(
                    txt_target_path,
                    result + ': ' + source_file_path + ' (' + mime_type + ')')
            elif normalized['result'] == 3:
                if old_result not in ('Converted successfully',
                                      'Manually converted'):
                    result = 'Manually converted'
                    converted_now = True
                else:
                    result = old_result
            elif normalized['result'] == 4:
                converted_now = True
                errors = True
                result = normalized['error']
                append_txt_file(
                    txt_target_path,
                    result + ': ' + source_file_path + ' (' + mime_type + ')')
            elif normalized['result'] == 5:
                result = 'Not a document'

            if normalized['norm_file_path']:
                row['norm_file_path'] = relpath(normalized['norm_file_path'],
                                                base_target_dir)

            file_copy_path = normalized['original_file_copy']
            if file_copy_path:
                file_copy_path = relpath(file_copy_path, base_target_dir)
            row['original_file_copy'] = file_copy_path

        row['result'] = result
        row_values = list(row.values())

        # TODO: Fixed by adding escapechar='\\' in append_tsv_row -> will that cause problems later?
        # row_values = [r.replace('\n', ' ') for r in row_values if r is not None]
        append_tsv_row(tsv_target_path, row_values)

        if sample and count > 9:
            break

    if not sample:
        shutil.move(tsv_target_path, tsv_source_path)
    # TODO: Add an option so that if merge = true, all files are copied to the top-level folder and empty subfolders are then deleted

    msg = None
    if sample:
        msg = 'Sample files converted.'
        if errors:
            msg = "Not all sample files were converted. See '" + txt_target_path + "' for details."
    else:
        if converted_now:
            msg = 'All files converted successfully.'
            if errors:
                msg = "Not all files were converted. See '" + txt_target_path + "' for details."
        else:
            msg = 'All files converted previously.'

    return msg, file_count, errors, originals  # TODO: Fix so this is used instead for the final summary when several folders have been converted
Example #34
# Open CSV file
stores = etl.fromcsv('stores.csv')

# Open XML document
locations = etl.fromxml('locations.xml', 'store', {'Name': 'Name', 'Lat': 'Lat', 'Lon': 'Lon'})
print(locations)

# Set output
output_table = [["ID", "Name", "Suburb", "State", "Postcode"]]

store_id = 1

# Read through the store.csv to generate output_table
store = etl.cut(stores, 'Name', 'Suburb', 'State', 'Postcode').distinct()
print(store)
for s in etl.values(store, 'Name', 'Suburb', 'State', 'Postcode'):
    # flatten the value tuple so each row matches the output_table header
    output_table.append([store_id] + list(s))
    store_id += 1
print (output_table)

# Merge and join XML and CSV together
merge_output = etl.join(stores, locations, key="Name")
print(merge_output)

store_table = etl.cut(merge_output, 'ID', 'Name', 'Suburb', 'State', 'Postcode', 'Lat', 'Lon')
print(etl.head(store_table, 5))

# Export to CSV file
etl.tocsv(merge_output, 'store_locations.csv')

Example #35
            emails.append(rec['first_name'] + '.' + rec['last_name'] +
                          '@mycompany.com')
        data2 = etl.addcolumn(data, 'email', emails)
    else:
        data2 = data
    etl.todb(data2, mysql_engine, table, create=True)

# load CSV file
data = etl.fromcsv(source=socialmedia_csv)
recs = etl.records(data)
# determine employee numbers
empnos = []
for rec in recs:
    sub = etl.fromdb(
        sqlite_engine,
        "SELECT emp_no FROM employees " + "where last_name = '" +
        rec['last_name'] + "' " + "and first_name = '" + rec['first_name'] +
        "' " + "and birth_date = '" + rec['birth_date'] + "' " +
        "order by birth_date desc " + "limit 1")
    vals = etl.values(sub, 'emp_no')
    if len(vals) > 0:
        empnos.append(vals[0])
    else:
        empnos.append(-1)  # dummy
# adding column gets ignored??
data2 = etl.addcolumn(data, 'emp_no', empnos)
etl.todb(data2, mysql_engine, 'socialmedia', create=True)

end_time = time.time()
print("execution time", end_time - start_time)