Example #1
import petl as etl


def transform_to_petl(data):
    isodate = etl.dateparser("%Y-%m-%d")
    data = etl.fromdataframe(data)
    data = etl.rename(data, {"index": "Date", "VALUE": "Value"})
    data = etl.convert(data, {"Date": lambda d: d[:10]})
    data = etl.convert(data, {"Date": lambda d: isodate(d)})
    return data
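A minimal call sketch, assuming the input is a pandas DataFrame whose reset index carries ISO date strings (the column names 'index' and 'VALUE' follow the renames above; the values are illustrative):

import pandas as pd

df = pd.DataFrame({"VALUE": [1.5, 2.0]},
                  index=["2020-01-01T00:00:00", "2020-01-02T00:00:00"])
tbl = transform_to_petl(df.reset_index())   # 'index' column holds the date strings
print(etl.look(tbl))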
Example #2
import petl as etl


def transform(data, data_set):
	data = data['observations']
	data = etl.fromdicts(data, header=['value','realtime_start','realtime_end','date'])
	data = etl.cut(data,'date','value')	
	data = etl.rename(data,{'date':'date','value': data_set.lower()})
	data = etl.convert(data,data_set.lower(),lambda val: 0 if val == '.' else val)
	return data
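This looks aimed at FRED-style observation payloads; a minimal sketch with a hand-built payload (all values are made up):

payload = {'observations': [
    {'realtime_start': '2021-01-01', 'realtime_end': '2021-01-01',
     'date': '2020-10-01', 'value': '21494.7'},
    {'realtime_start': '2021-01-01', 'realtime_end': '2021-01-01',
     'date': '2021-01-01', 'value': '.'},   # the source marks missing values with '.'
]}
tbl = transform(payload, 'GDP')   # columns: 'date', 'gdp'; '.' becomes 0
print(etl.look(tbl))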
Example #3
def unpackcall(tbl, *keys, **kwargs):
    """
    Unpack the call column. E.g.::
    
        >>> from petlx.vcf import fromvcf, unpackinfo, meltsamples, unpackcall
        >>> from petl import look, cutout
        >>> t1 = fromvcf('../fixture/sample.vcf')
        >>> t2 = meltsamples(t1)
        >>> t3 = unpackcall(t2)
        >>> t4 = cutout(t3, 'INFO')
        >>> look(t4)
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | 'CHROM' | 'POS' | 'ID'        | 'REF' | 'ALT' | 'QUAL' | 'FILTER' | 'SAMPLE'  | 'GT'  | 'GQ' | 'DP' | 'HQ'         |
        +=========+=======+=============+=======+=======+========+==========+===========+=======+======+======+==============+
        | '19'    |   111 | None        | 'A'   | [C]   |    9.6 | []       | 'NA00001' | '0|0' | None | None | [10, 10]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    |   111 | None        | 'A'   | [C]   |    9.6 | []       | 'NA00002' | '0|0' | None | None | [10, 10]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    |   111 | None        | 'A'   | [C]   |    9.6 | []       | 'NA00003' | '0/1' | None | None | [3, 3]       |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    |   112 | None        | 'A'   | [G]   |     10 | []       | 'NA00001' | '0|0' | None | None | [10, 10]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    |   112 | None        | 'A'   | [G]   |     10 | []       | 'NA00002' | '0|0' | None | None | [10, 10]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    |   112 | None        | 'A'   | [G]   |     10 | []       | 'NA00003' | '0/1' | None | None | [3, 3]       |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '20'    | 14370 | 'rs6054257' | 'G'   | [A]   |     29 | []       | 'NA00001' | '0|0' |   48 |    1 | [51, 51]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '20'    | 14370 | 'rs6054257' | 'G'   | [A]   |     29 | []       | 'NA00002' | '1|0' |   48 |    8 | [51, 51]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '20'    | 14370 | 'rs6054257' | 'G'   | [A]   |     29 | []       | 'NA00003' | '1/1' |   43 |    5 | [None, None] |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '20'    | 17330 | None        | 'T'   | [A]   |      3 | ['q10']  | 'NA00001' | '0|0' |   49 |    3 | [58, 50]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        
    .. versionadded:: 0.5
    
    """
    if not keys:
        if hasattr(tbl, 'filename'):
            try:
                import vcf as pyvcf
            except ImportError as e:
                raise UnsatisfiedDependency(e, dep_message)
            reader = pyvcf.Reader(filename=tbl.filename)
            # all FORMAT
            keys = reader.formats.keys()
        else:
            tbl = convert(tbl, 'CALL', lambda v: v.data._asdict()) # enable sampling of keys from data
    result = unpackdict(tbl, 'CALL', keys=keys)
    if 'prefix' in kwargs:
        result = rename(result, {k: kwargs['prefix'] + k for k in keys})
    if hasattr(tbl, 'filename'):
        return VCFWrapper(result, tbl.filename)
    else:
        return result
Example #4
def convert_folder(base_source_dir,
                   base_target_dir,
                   tmp_dir,
                   tika=False,
                   ocr=False,
                   merge=False,
                   tsv_source_path=None,
                   tsv_target_path=None,
                   make_unique=True,
                   sample=False,
                   zip=False):
    # WAIT: Add an option in the GUI to choose whether files should be OCR-processed
    txt_target_path = base_target_dir + '_result.txt'
    json_tmp_dir = base_target_dir + '_tmp'
    converted_now = False
    errors = False
    originals = False

    if merge is False:  # TODO: Are both arguments needed?
        make_unique = False

    if tsv_source_path is None:
        tsv_source_path = base_target_dir + '.tsv'
    else:
        txt_target_path = os.path.splitext(
            tsv_source_path)[1][1:] + '_result.txt'

    if tsv_target_path is None:
        tsv_target_path = base_target_dir + '_processed.tsv'

    if os.path.exists(tsv_target_path):
        os.remove(tsv_target_path)

    Path(base_target_dir).mkdir(parents=True, exist_ok=True)

    # TODO: Does the mime type show directly whether a file is PDF/A, or must the extra fields in the two checks below be consulted? Pre-check with Tika and Siegfried?

    # TODO: Is this TSV check needed here? The check is already done before this function is called, so it may be redundant.
    if not os.path.isfile(tsv_source_path):
        if tika:
            run_tika(tsv_source_path, base_source_dir, json_tmp_dir, zip)
        else:
            run_siegfried(base_source_dir, tmp_dir, tsv_source_path, zip)

    # TODO: Add a check that the TSV file is not empty
    replace_text_in_file(tsv_source_path, '\0', '')

    table = etl.fromtsv(tsv_source_path)
    table = etl.rename(table, {
        'filename': 'source_file_path',
        'tika_batch_fs_relative_path': 'source_file_path',
        'filesize': 'file_size',
        'mime': 'mime_type',
        'Content_Type': 'mime_type',
        'Version': 'version'
    },
                       strict=False)

    thumbs_table = etl.select(
        table, lambda rec: Path(rec.source_file_path).name == 'Thumbs.db')
    if etl.nrows(thumbs_table) > 0:
        thumbs_paths = etl.values(thumbs_table, 'source_file_path')
        for path in thumbs_paths:
            if '/' not in path:
                path = os.path.join(base_source_dir, path)
            if os.path.isfile(path):
                os.remove(path)

        table = etl.select(
            table, lambda rec: Path(rec.source_file_path).name != 'Thumbs.db')

    table = etl.select(table, lambda rec: rec.source_file_path != '')
    table = etl.select(table, lambda rec: '#' not in rec.source_file_path)
    # WAIT: The check for embedded documents on the line above is not fully reliable, since '#' can in fact occur in file names
    row_count = etl.nrows(table)

    file_count = sum([len(files) for r, d, files in os.walk(base_source_dir)])

    if row_count == 0:
        print('No files to convert. Exiting.')
        return 'Error', file_count
    elif file_count != row_count:
        print('Row count: ' + str(row_count))
        print('File count: ' + str(file_count))
        print("Files listed in '" + tsv_source_path +
              "' doesn't match files on disk. Exiting.")
        return 'Error', file_count
    elif not zip:
        print('Converting files..')

    # WAIT: Add a check of file size before and after conversion

    append_fields = ('version', 'norm_file_path', 'result',
                     'original_file_copy', 'id')
    table = add_fields(append_fields, table)

    cut_fields = ('0', '1', 'X_TIKA_EXCEPTION_runtime',
                  'X_TIKA_EXCEPTION_warn')
    table = remove_fields(cut_fields, table)

    header = etl.header(table)
    append_tsv_row(tsv_target_path, header)

    # Treat csv (detected from extension only) as plain text:
    table = etl.convert(table,
                        'mime_type',
                        lambda v, row: 'text/plain'
                        if row.id == 'x-fmt/18' else v,
                        pass_row=True)

    # Update for missing mime types where id is known:
    table = etl.convert(table,
                        'mime_type',
                        lambda v, row: 'application/xml'
                        if row.id == 'fmt/979' else v,
                        pass_row=True)

    if os.path.isfile(txt_target_path):
        os.remove(txt_target_path)

    data = etl.dicts(table)
    count = 0
    for row in data:
        count += 1
        count_str = ('(' + str(count) + '/' + str(file_count) + '): ')
        source_file_path = row['source_file_path']
        if '/' not in source_file_path:
            source_file_path = os.path.join(base_source_dir, source_file_path)

        mime_type = row['mime_type']
        # TODO: Does not work when Tika is used -> find out why
        if ';' in mime_type:
            mime_type = mime_type.split(';')[0]

        version = row['version']
        result = None
        old_result = row['result']

        if not mime_type:
            if os.path.islink(source_file_path):
                mime_type = 'n/a'

            # kind = filetype.guess(source_file_path)
            extension = os.path.splitext(source_file_path)[1][1:].lower()
            if extension == 'xml':
                mime_type = 'application/xml'

        if not zip:
            print_path = os.path.relpath(source_file_path,
                                         Path(base_source_dir).parents[1])
            print(count_str + '.../' + print_path + ' (' + mime_type + ')')

        if mime_type not in mime_to_norm.keys():
            # print("|" + mime_type + "|")

            errors = True
            converted_now = True
            result = 'Conversion not supported'
            append_txt_file(
                txt_target_path,
                result + ': ' + source_file_path + ' (' + mime_type + ')')
            row['norm_file_path'] = ''
            row['original_file_copy'] = ''
        else:
            keep_original = mime_to_norm[mime_type][0]

            if keep_original:
                originals = True

            if zip:
                keep_original = False

            function = mime_to_norm[mime_type][1]

            # Ensure unique file names in dir hierarchy:
            norm_ext = mime_to_norm[mime_type][2]
            if not norm_ext:
                norm_ext = 'none'

            if make_unique:
                norm_ext = (base64.b32encode(
                    bytes(
                        str(count), encoding='ascii'))).decode('utf8').replace(
                            '=', '').lower() + '.' + norm_ext
            target_dir = os.path.dirname(
                source_file_path.replace(base_source_dir, base_target_dir))
            normalized = file_convert(source_file_path,
                                      mime_type,
                                      function,
                                      target_dir,
                                      tmp_dir,
                                      None,
                                      norm_ext,
                                      version,
                                      ocr,
                                      keep_original,
                                      zip=zip)

            if normalized['result'] == 0:
                errors = True
                result = 'Conversion failed'
                append_txt_file(
                    txt_target_path,
                    result + ': ' + source_file_path + ' (' + mime_type + ')')
            elif normalized['result'] == 1:
                result = 'Converted successfully'
                converted_now = True
            elif normalized['result'] == 2:
                errors = True
                result = 'Conversion not supported'
                append_txt_file(
                    txt_target_path,
                    result + ': ' + source_file_path + ' (' + mime_type + ')')
            elif normalized['result'] == 3:
                if old_result not in ('Converted successfully',
                                      'Manually converted'):
                    result = 'Manually converted'
                    converted_now = True
                else:
                    result = old_result
            elif normalized['result'] == 4:
                converted_now = True
                errors = True
                result = normalized['error']
                append_txt_file(
                    txt_target_path,
                    result + ': ' + source_file_path + ' (' + mime_type + ')')
            elif normalized['result'] == 5:
                result = 'Not a document'

            if normalized['norm_file_path']:
                row['norm_file_path'] = relpath(normalized['norm_file_path'],
                                                base_target_dir)

            file_copy_path = normalized['original_file_copy']
            if file_copy_path:
                file_copy_path = relpath(file_copy_path, base_target_dir)
            row['original_file_copy'] = file_copy_path

        row['result'] = result
        row_values = list(row.values())

        # TODO: Fixed by adding escapechar='\\' in append_tsv_row -> will that cause problems later?
        # row_values = [r.replace('\n', ' ') for r in row_values if r is not None]
        append_tsv_row(tsv_target_path, row_values)

        if sample and count > 9:
            break

    if not sample:
        shutil.move(tsv_target_path, tsv_source_path)
    # TODO: Add an option so that if merge = true, all files are copied to a top-level folder and empty subfolders are then deleted

    msg = None
    if sample:
        msg = 'Sample files converted.'
        if errors:
            msg = "Not all sample files were converted. See '" + txt_target_path + "' for details."
    else:
        if converted_now:
            msg = 'All files converted successfully.'
            if errors:
                msg = "Not all files were converted. See '" + txt_target_path + "' for details."
        else:
            msg = 'All files converted previously.'

    return msg, file_count, errors, originals  # TODO: Fix so this is instead used for a final summary when several folders have been converted
Example #5
import re

import petl as pt


def substitute(table, field, pattern, repl, count=0, flags=0):
    program = re.compile(pattern, flags)
    convert = lambda tempData: program.sub(repl, tempData, count=count)
    return pt.convert(table, field, convert)
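A quick sketch of stripping non-digit characters from a phone column with it (table contents are illustrative):

tbl = [['name', 'phone'], ['Ann', '(555) 123-4567']]
print(pt.look(substitute(tbl, 'phone', r'\D', '')))   # phone -> '5551234567'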
Example #6
from __future__ import division, print_function, absolute_import


# progress()
############

import petl as etl
table = etl.dummytable(100000)
table.progress(10000).tocsv('example.csv')


# clock()
#########

import petl as etl
t1 = etl.dummytable(100000)
c1 = etl.clock(t1)
t2 = etl.convert(c1, 'foo', lambda v: v**2)
c2 = etl.clock(t2)
p = etl.progress(c2, 10000)
etl.tocsv(p, 'example.csv')
# time consumed retrieving rows from t1
c1.time
# time consumed retrieving rows from t2
c2.time
# actual time consumed by the convert step
c2.time - c1.time


Example #7
    'day': date
}
headers = {'content-type': 'application/x-www-form-urlencoded'}
#gets the information from ppms and creates a csv file
r = requests.post(url, data=payload, headers=headers)
f = open('todays_bookings.csv', 'w')
f.write(r.text)
f.close()

#Load the table
table1 = petl.fromcsv('todays_bookings.csv')
# Alter the columns
table2 = petl.cut(table1, ' Object', ' User', ' Start time', ' End time',
                  ' Training', ' Assisted')
# Reorder the user names
table3 = petl.convert(table2, ' User',
                      lambda row: " ".join(re.findall("\S+", row)[::-1]))
# Reorder the rows
table4 = petl.sort(table3, key=[' Object', ' Start time'])
# Save to new file
petl.tocsv(table4, 'new.csv')

#Reopens the CSV file (stupid, I know) and removes unnecessary characters
csvfile = ""
ppmscal = csv.reader(open('new.csv'), delimiter=',')
for row in ppmscal:
    csvfile += str(row) + '\n'
csvtxt = csvfile.replace("(", "").replace(")", "").replace("'", "").replace(
    "[", "").replace("]", "")
csvtxt = csvtxt[:-1]

Example #8
    def unpack_list(self,
                    column,
                    include_original=False,
                    missing=None,
                    replace=False,
                    max_columns=None):
        """
        Unpack list values from one column into separate columns. Numbers the
        columns.

        .. code-block:: python

          # Begin with a list in column
          json = [{'id': '5421',
                   'name': 'Jane Green',
                   'phones': ['512-699-3334', '512-222-5478']
                  }
                 ]

          tbl = Table(json)
          print (tbl)
          >>> {'id': '5421', 'name': 'Jane Green', 'phones': ['512-699-3334', '512-222-5478']}

          tbl.unpack_list('phones', replace=True)
          print (tbl)
          >>> {'id': '5421', 'name': 'Jane Green', 'phones_0': '512-699-3334', 'phones_1': '512-222-5478'} # noqa: E501

        `Args:`
            column: str
                The column name to unpack
            include_original: boolean
                Retain original column after unpacking
            missing: str
                If a value is missing, the value to fill it with
            replace: boolean
                Return new table or replace existing
            max_columns: int
                The maximum number of columns to unpack
        `Returns:`
            None
        """

        # Convert all column values to list to avoid unpack errors
        self.table = petl.convert(
            self.table, column, lambda v: [v]
            if not isinstance(v, list) else v)

        # Find the max number of values in list for all rows
        col_count = 0
        for row in self.cut(column):
            if len(row[column]) > col_count:
                col_count = len(row[column])

        # If max columns provided, set max columns
        if col_count > 0 and max_columns:
            col_count = max_columns

        # Create new column names "COL_01, COL_02"
        new_cols = []
        for i in range(col_count):
            new_cols.append(column + '_' + str(i))

        tbl = petl.unpack(self.table,
                          column,
                          new_cols,
                          include_original=include_original,
                          missing=missing)

        if replace:
            self.table = tbl

        else:
            return tbl
Example #9
import petl as etl

#Extracting data from example csv file
table1 = etl.fromcsv('example.csv')
print(table1)
#etl.look(table1)

#Transformation function to be applied on extracted data
table2 = etl.convert(table1, 'foo', 'upper')
table3 = etl.convert(table2, 'bar', int)
table4 = etl.convert(table3, 'baz', float)
table5 = etl.addfield(table4, 'finally', lambda row: row.bar * row.baz)
print(table5)
#etl.look(table5)

#Writing above ETL pipeline in a functional style
table = (etl.fromcsv('example.csv').convert('foo', 'upper').convert(
    'bar', int).convert('baz', float).addfield('finally',
                                               lambda row: row.bar * row.baz))

table.look()  #look function only displays five rows.
print(table)

#OOP style programming
l = [['foo', 'bar'], ['a', 1], ['b', 2], ['c', 2]]
table6 = etl.wrap(l)
print(table6)
Example #10
    def get_events_organization(self,
                                organization_id=None,
                                updated_since=None,
                                timeslot_start=None,
                                timeslot_end=None,
                                timeslots_table=False,
                                max_timeslots=None):
        """
        Fetch all public events for an organization. This includes both events owned
        by the organization (as indicated by the organization field on the event object)
        and events of other organizations promoted by this specified organization.

        .. note::
            API Key Required

        `Args:`
            organization_id: list or int
                Filter events by a single or multiple organization ids
            updated_since: str
                Filter to events updated since given date (ISO Date)
            timeslot_start: str
                Filter by a timeslot start of events using ``>``,``>=``,``<``,``<=``
                operators and ISO date (ex. ``<=2018-12-13 05:00:00PM``)
            timeslot_end: str
                Filter by a timeslot end of events using ``>``,``>=``,``<``,``<=``
                operators and ISO date (ex. ``<=2018-12-13 05:00:00PM``)
            timeslots_table: boolean
                Return timeslots as a separate long table. Useful for extracting
                to databases.
            zipcode: str
                Filter by a Events' Locations' postal code. If present, returns Events
                sorted by distance from zipcode. If present, virtual events will not be returned.
            max_dist: str
                Filter Events' Locations' distance from provided zipcode.
            visibility: str
                Either `PUBLIC` or `PRIVATE`. Private events only return if user is authenticated;
                if `visibility=PRIVATE` and user doesn't have permission, no events returned.
            exclude_full: bool
                If `exclude_full=true`, filter out full Timeslots (and Events if all of an Event's
                Timeslots are full)
            is_virtual: bool
                `is_virtual=false` will return only in-person events, while `is_virtual=true` will
                return only virtual events. If excluded, return virtual and in-person events. Note
                that providing a zipcode also implies `is_virtual=false`.
            event_types:enum
                The type of the event, one of: `CANVASS`, `PHONE_BANK`, `TEXT_BANK`, `MEETING`,
                `COMMUNITY`, `FUNDRAISER`, `MEET_GREET`, `HOUSE_PARTY`, `VOTER_REG`, `TRAINING`,
                `FRIEND_TO_FRIEND_OUTREACH`, `DEBATE_WATCH_PARTY`, `ADVOCACY_CALL`, `OTHER`.
                This list may expand in the future.
            max_timeslots: int
                If not returning a timeslots table, time slots are unpacked into columns.
                If this arg is not set, a column is added for each time slot. The argument
                limits the number of columns and discards any additional timeslots
                after that.

                For example: If there are 20 timeslots associated with your event,
                and you set the max time slots to 5, it will only return the first 5
                time slots as ``time_slot_0``, ``time_slot_1`` etc.

                This is helpful in situations where you have a regular sync
                running and want to ensure that the column headers remain static.

        `Returns`
            Parsons Table or dict or Parsons Tables
                See :ref:`parsons-table` for output options.
        """

        if isinstance(organization_id, (str, int)):
            organization_id = [organization_id]

        args = {
            'organization_id': organization_id,
            'updated_since': date_to_timestamp(updated_since),
            'timeslot_start': self._time_parse(timeslot_start),
            'timeslot_end': self._time_parse(timeslot_end),
        }

        tbl = Table(
            self.request_paginate(self.uri + 'events', args=args, auth=True))

        if tbl.num_rows > 0:

            tbl.unpack_dict('sponsor')
            tbl.unpack_dict('location', prepend=False)
            tbl.unpack_dict('location', prepend=False)  # Intentional duplicate
            tbl.table = petl.convert(tbl.table, 'address_lines',
                                     lambda v: ' '.join(v))

            if timeslots_table:

                timeslots_tbl = tbl.long_table(['id'], 'timeslots', 'event_id')
                return {'events': tbl, 'timeslots': timeslots_tbl}

            else:
                tbl.unpack_list('timeslots',
                                replace=True,
                                max_columns=max_timeslots)
                cols = tbl.columns
                for c in cols:
                    if re.search('timeslots', c, re.IGNORECASE) is not None:
                        tbl.unpack_dict(c)

        return tbl
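In Parsons this method sits on the Mobilize America connector; a rough call sketch (the import path, constructor argument and ids below are assumptions, not taken from the listing):

from parsons import MobilizeAmerica   # assumed import path

mobilize = MobilizeAmerica(api_key='MY_API_KEY')   # hypothetical credentials
events = mobilize.get_events_organization(
    organization_id=1234,              # illustrative id
    updated_since='2021-01-01',
    max_timeslots=5)
events.to_csv('events.csv')            # Parsons Table helper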
Example #11
from __future__ import division, print_function, absolute_import


# convert()
###########

import petl as etl
table1 = [['foo', 'bar', 'baz'],
          ['A', '2.4', 12],
          ['B', '5.7', 34],
          ['C', '1.2', 56]]
# using a built-in function:
table2 = etl.convert(table1, 'bar', float)
table2
# using a lambda function::
table3 = etl.convert(table1, 'baz', lambda v: v*2)
table3
# a method of the data value can also be invoked by passing
# the method name
table4 = etl.convert(table1, 'foo', 'lower')
table4
# arguments to the method invocation can also be given
table5 = etl.convert(table1, 'foo', 'replace', 'A', 'AA')
table5
# values can also be translated via a dictionary
table7 = etl.convert(table1, 'foo', {'A': 'Z', 'B': 'Y'})
table7
# the same conversion can be applied to multiple fields
table8 = etl.convert(table1, ('foo', 'bar', 'baz'), str)
table8
# multiple conversions can be specified at the same time
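# e.g. by passing a dict that maps each field to its converter (a minimal sketch):
table9 = etl.convert(table1, {'bar': float, 'baz': lambda v: v * 2})
table9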
Example #12
def join_annotations_with_gff_features(annot_file,
                                       annot_file_out,
                                       feature_type="CDS",
                                       annot_join="interval",
                                       annot_sep="\t",
                                       gff_files=None,
                                       gff_files_list=None,
                                       max_overlap_only=True):
    """Join features in GFF3 file with annotations.
    Add annotations as attributes of GFF features.
    The join mode must be given in annot_join: either by nucleotide intervals,
    keyed on the first three fields as in GFF itself, or by feature IDs
    (used when the annotation was done for protein sequences - not yet implemented).
    Coordinates and strand of each annotation will be replaced with those of the
    overlapping feature, if any (this can result in more than one record per annotation).
    """
    import petl
    import petlx

    ann_all = petl.io.csv.fromcsv(annot_file, delimiter=annot_sep)
    ann_all = petl.convert(ann_all,
                           ('query_start', 'query_end', 'query_strand'), int)
    ann_all = petl.addcolumn(ann_all, 'ann_rec_ind', range(ann_all.nrows()))

    with petl_opened_file_source(annot_file_out, "w") as annot_out:
        for i_inp, gff_file in enumerate(
                util.glob_files(files_globs=gff_files,
                                files_globs_list=gff_files_list)):
            log.info("Working on feature file {}".format(gff_file))
            feat = petlx.bio.gff3.fromgff3(gff_file)
            feat = petl_fix_gff_coord(feat)

            feat_seqid_set = set(feat["seqid"])

            if feature_type:
                feat = petl.selecteq(feat, 'type', feature_type)

            ann = petl.selectin(ann_all, 'query_id', feat_seqid_set)

            ## somehow we get many ORFs in GFFs (and Genbank files) from both RASTtk and ClovR where one
            ## ORF ends at the start position of another ORF (and the BLAST match starts at the start of the
            ## second ORF).
            jn = petl.transform.intervals.intervalleftjoin(
                ann,
                feat,
                rstart="start",
                rstop="end",
                lstart="query_start",
                lstop="query_end",
                rkey="seqid",
                lkey="query_id",
                rprefix="feat_")
            jn = petl.addfield(jn,"overlap_len",
                               lambda rec: (min(rec['end'],rec['query_end']) - max(rec['start'],rec['query_start']) + 1) \
                                       if rec['start'] is not None else 0)
            if max_overlap_only:
                jn = petl.groupselectmax(jn,
                                         key="ann_rec_ind",
                                         value="overlap_len")
            _strand_conv = {'+': 1, '-': -1, '.': 0}
            jn = petl.convert(jn,
                              {
                                  'query_start' : lambda v,row: row.start if row.start is not None else v,
                                  'query_end': lambda v,row: row.end if row.end is not None else v,
                                  'query_strand': lambda v,row,_strand_conv=_strand_conv: _strand_conv[row.strand] \
                                      if row.strand is not None else row.query_strand
                              },
                              pass_row=True
                              )
            if i_inp == 0:
                out_func = petl.io.csv.tocsv
            else:
                out_func = petl.io.csv.appendcsv
            out_func(jn, annot_out, delimiter=annot_sep)
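A hedged call sketch (all file names are made up; the annotation file is expected to carry query_id, query_start, query_end and query_strand columns, per the code above):

join_annotations_with_gff_features(
    annot_file='blast_annotations.tsv',          # hypothetical input
    annot_file_out='annotations_with_features.tsv',
    feature_type='CDS',
    gff_files=['assembly.gff3'],                 # hypothetical GFF3 file
    max_overlap_only=True)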
Example #13
from datetime import timedelta

import petl as etl


def parse_duration(inp):
    return etl.convert(inp, 'dur', lambda v: timedelta(milliseconds=v))
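Assuming inp is any petl table with a 'dur' column holding millisecond counts, usage might look like:

inp = [['name', 'dur'], ['job-a', 1500], ['job-b', 62000]]
print(etl.look(parse_duration(inp)))   # dur becomes 0:00:01.500000, 0:01:02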
Example #14
# convert

table1 = [['foo', 'bar'],
          ['A', '2.4'],
          ['B', '5.7'],
          ['C', '1.2'],
          ['D', '8.3']]
table6 = [['gender', 'age'],
          ['M', 12],
          ['F', 34],
          ['-', 56]]

from petl import convert, look
look(table1)
# using the built-in float function:
table2 = convert(table1, 'bar', float)
look(table2)
# using a lambda function::
table3 = convert(table2, 'bar', lambda v: v**2)
look(table3)    
# a method of the data value can also be invoked by passing the method name
table4 = convert(table1, 'foo', 'lower')
look(table4)
# arguments to the method invocation can also be given
table5 = convert(table4, 'foo', 'replace', 'a', 'aa')
look(table5)
# values can also be translated via a dictionary
look(table6)
table7 = convert(table6, 'gender', {'M': 'male', 'F': 'female'})
look(table7)
Example #15
def parse_datetime(inp, date_fields: List[str]):
    return etl.convert(inp, date_fields, parsedatetime)
Example #16
international_code = "(+61)"

with open(IN_FILE, 'r') as infile, open(OUT_FILE, "w") as outfile:
    csv_reader = csv.reader(infile)
    writer = csv.writer(outfile)
    headers = next(csv_reader, None)  #skipping header row
    writer.writerow(headers)
    for row in csv_reader:
        number_column = row[5]
        state_column = row[3]
        clean_num = re.sub(r"\D", "", row[5])[-8:]
        formatted_num = international_code + " " + regional_code[
            state_column] + " " + clean_num
        row[5] = formatted_num
        writer.writerow(row)

services = petl.fromcsv(SERVICES_FILE)
offices = petl.fromcsv(OUT_FILE)
offices = offices.rename({"Contact Name": "Office", "Phone Number": "Phone"})
offices = petl.cutout(offices,"State","Postcode")

locations = petl.fromcsv(LOC_FILE)
locations = locations.rename({"officeID": "OfficeID"})
office_service = petl.join(services, offices, key='OfficeID')

office_service_locations = petl.join(
    office_service, locations, key='OfficeID')

office_service_locations = petl.convert(office_service_locations,'OfficeServiceID',int)
office_service_locations = petl.sort(office_service_locations,'OfficeServiceID')
petl.tocsv(office_service_locations, 'office_service_locations.csv')
Example #17
import petl as etl


def transform(table, *fields, **args):
    if len(fields) > 0:
        return etl.convert(table, fields, empty_as_none)
    else:
        return etl.convertall(table, empty_as_none)
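empty_as_none is not shown in the listing; a minimal stand-in plus a call might look like this (the helper body is an assumption):

def empty_as_none(v):
    # assumed behaviour: turn empty strings into None
    return None if v == '' else v

tbl = [['id', 'name'], ['1', ''], ['2', 'Ada']]
print(etl.look(transform(tbl, 'name')))   # '' in 'name' becomes None
print(etl.look(transform(tbl)))           # no fields given: convert every column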
Example #18
look(table6)
# using the header keyword argument with two input tables
look(table7)
look(table8)
table9 = cat(table7, table8, header=['A', 'foo', 'B', 'bar', 'C'])
look(table9)

# convert

table1 = [['foo', 'bar', 'baz'], ['A', '2.4', 12], ['B', '5.7', 34],
          ['C', '1.2', 56]]

from petl import convert, look
look(table1)
# using the built-in float function:
table2 = convert(table1, 'bar', float)
look(table2)
# using a lambda function::
table3 = convert(table1, 'baz', lambda v: v * 2)
look(table3)
# a method of the data value can also be invoked by passing the method name
table4 = convert(table1, 'foo', 'lower')
look(table4)
# arguments to the method invocation can also be given
table5 = convert(table1, 'foo', 'replace', 'A', 'AA')
look(table5)
# values can also be translated via a dictionary
table7 = convert(table1, 'foo', {'A': 'Z', 'B': 'Y'})
look(table7)
# the same conversion can be applied to multiple fields
table8 = convert(table1, ('foo', 'bar', 'baz'), str)
Example #19
look(table9)


#########
# convert
#########

table1 = [['foo', 'bar', 'baz'],
          ['A', '2.4', 12],
          ['B', '5.7', 34],
          ['C', '1.2', 56]]

from petl import convert, look
look(table1)
# using the built-in float function:
table2 = convert(table1, 'bar', float)
look(table2)
# using a lambda function::
table3 = convert(table1, 'baz', lambda v: v*2)
look(table3)    
# a method of the data value can also be invoked by passing the method name
table4 = convert(table1, 'foo', 'lower')
look(table4)
# arguments to the method invocation can also be given
table5 = convert(table1, 'foo', 'replace', 'A', 'AA')
look(table5)
# values can also be translated via a dictionary
table7 = convert(table1, 'foo', {'A': 'Z', 'B': 'Y'})
look(table7)
# the same conversion can be applied to multiple fields
table8 = convert(table1, ('foo', 'bar', 'baz'), str)
Example #20
import petl as etl


def cleanFormatTable(table):
	newtable = table
	for h in etl.header(table):
		newtable = etl.convert(newtable, h, sanitize)
	return newtable
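sanitize is not part of the listing; with a stand-in that trims whitespace, usage would look roughly like:

def sanitize(v):
    # assumed stand-in: strip surrounding whitespace from string values
    return v.strip() if isinstance(v, str) else v

tbl = [['foo', 'bar'], ['  a  ', ' 1 ']]
print(etl.look(cleanFormatTable(tbl)))   # every column passes through sanitize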
Example #21
        'dbname': 'data_warehouse',
        'db_schema': 'warehouse'
    }
    Model().connection(config)
    number_of_record = 0
    loader = Loader()
    while True:
        data_loaded = load_file('./group_call.json')
        if len(data_loaded) <= number_of_record:
            logging.info("Continue to the next iteration ..")
            sleep(10)
            continue

        number_of_record = len(data_loaded)
        data = etl.fromdicts(data_loaded)
        converted_data = etl.convert(data, 'xml',
                                     lambda r: XMLHelper.xml_to_json(r))
        converted_data = etl.convert(
            converted_data, 'txt',
            lambda r: json.loads(r.strip('group_call_summary#')))
        converted_data = etl.select(
            converted_data, lambda r: len(re.split(r'\/', r['peer'])) == 1)
        converted_data = etl.addfield(converted_data, 'room_id',
                                      lambda r: r['txt'].get('roomId'))
        converted_data = etl.addfield(
            converted_data, 'from_id',
            lambda r: id_str_to_int(r['xml']['from']))
        converted_data = etl.addfield(converted_data, 'to_id',
                                      lambda r: id_str_to_int(r['xml']['to']))
        converted_data = etl.addfield(converted_data, 'message_id',
                                      lambda r: r['xml']['id'])
        converted_data = etl.addfield(
Example #22
from __future__ import division, print_function, absolute_import

example_data = """foo,bar,baz
a,1,3.4
b,2,7.4
c,6,2.2
d,9,8.1
"""
with open('example.csv', 'w') as f:
    f.write(example_data)

import petl as etl
table1 = etl.fromcsv('example.csv')
table2 = etl.convert(table1, 'foo', 'upper')
table3 = etl.convert(table2, 'bar', int)
table4 = etl.convert(table3, 'baz', float)
table5 = etl.addfield(table4, 'quux', lambda row: row.bar * row.baz)
table5

table = (
    etl
    .fromcsv('example.csv')
    .convert('foo', 'upper')
    .convert('bar', int)
    .convert('baz', float)
    .addfield('quux', lambda row: row.bar * row.baz)
)
table

l = [['foo', 'bar'], ['a', 1], ['b', 2], ['c', 2]]
table = etl.wrap(l)
Example #23
if CLEAN_UP:
    table = clean_up(table, 'rcv_nm')
    table = clean_up(table, 'recp_cd')
    table = clean_up(table, 'ins_ind')
    table = clean_up(table, 'geo_ind')
    table = clean_up(table, 'cid')
    table = clean_up(table, 'occ_typ')
    print('TRIMMED HEADERS = ' + str(etl.header(table)))

table = etl.select(table, 'occ_dt', lambda x: x > datetime(2000, 1, 1))
print('ROWS POST YR 2000 = ' + str(etl.nrows(table)))

mine_table = etl.fromcsv('mines.csv', encoding='utf-8')

##handle leading 0's
mine_table = etl.convert(mine_table, 'mine_no', lambda x: str(int(x)))
table = etl.convert(table, 'mine_no', lambda x: str(int(x)))

#MAP mine_no to mine_guid
table = etl.leftjoin(table, mine_table, key='mine_no')
table = clean_up(table, 'mine_no')
#make sure this is 0
if etl.valuecount(table, 'mine_guid', None)[0] > 0:
    print('mine_guid, mine_no pair missing from mines.csv')
    exit(1)

######
print('CONVERT AND RENAME descript1 to recommendation')
table = etl.addfield(table, 'recommendation', lambda x: x['descript1'])
table = clean_up(table, 'descript1')
Example #24
look(table8)
table9 = cat(table7, table8, header=['A', 'foo', 'B', 'bar', 'C'])
look(table9)


# convert

table1 = [['foo', 'bar', 'baz'],
          ['A', '2.4', 12],
          ['B', '5.7', 34],
          ['C', '1.2', 56]]

from petl import convert, look
look(table1)
# using the built-in float function:
table2 = convert(table1, 'bar', float)
look(table2)
# using a lambda function::
table3 = convert(table1, 'baz', lambda v: v*2)
look(table3)    
# a method of the data value can also be invoked by passing the method name
table4 = convert(table1, 'foo', 'lower')
look(table4)
# arguments to the method invocation can also be given
table5 = convert(table1, 'foo', 'replace', 'A', 'AA')
look(table5)
# values can also be translated via a dictionary
table7 = convert(table1, 'foo', {'A': 'Z', 'B': 'Y'})
look(table7)
# the same conversion can be applied to multiple fields
table8 = convert(table1, ('foo', 'bar', 'baz'), str)
Example #25
    def get_events(self,
                   organization_id=None,
                   updated_since=None,
                   timeslot_start=None,
                   timeslot_end=None,
                   timeslots_table=False,
                   max_timeslots=None):
        """
        Fetch all public events on the platform.

        `Args:`
            organization_id: list or int
                Filter events by a single or multiple organization ids
            updated_since: str
                Filter to events updated since given date (ISO Date)
            timeslot_start: str
                Filter by a timeslot start of events using ``>``,``>=``,``<``,``<=``
                operators and ISO date (ex. ``<=2018-12-13 05:00:00PM``)
            timeslot_end: str
                Filter by a timeslot end of events using ``>``,``>=``,``<``,``<=``
                operators and ISO date (ex. ``<=2018-12-13 05:00:00PM``)
            timeslots_table: boolean
                Return timeslots as a separate long table. Useful for extracting
                to databases.
            max_timeslots: int
                If not returning a timeslots table, time slots are unpacked into columns.
                If this kwarg is not set, a column is added for each time slot. The argument
                limits the number of columns and discards any additional timeslots
                after that.

                For example: If there are 20 timeslots associated with your event,
                and you set the max time slots to 5, it will only return the first 5
                time slots as ``time_slot_0``, ``time_slot_1`` etc.

                This is helpful in situations where you have a regular sync
                running and want to ensure that the column headers remain static.

        `Returns`
            Parsons Table or dict or Parsons Tables
                See :ref:`parsons-table` for output options.
        """

        if isinstance(organization_id, (str, int)):
            organization_id = [organization_id]

        args = {
            'organization_id': organization_id,
            'updated_since': date_to_timestamp(updated_since),
            'timeslot_start': self._time_parse(timeslot_start),
            'timeslot_end': self._time_parse(timeslot_end)
        }

        tbl = Table(self.request_paginate(self.uri + 'events', args=args))

        if tbl.num_rows > 0:

            tbl.unpack_dict('sponsor')
            tbl.unpack_dict('location', prepend=False)
            tbl.unpack_dict('location', prepend=False)  # Intentional duplicate
            tbl.table = petl.convert(tbl.table, 'address_lines',
                                     lambda v: ' '.join(v))

            if timeslots_table:

                timeslots_tbl = tbl.long_table(['id'], 'timeslots', 'event_id')
                return {'events': tbl, 'timeslots': timeslots_tbl}

            else:
                tbl.unpack_list('timeslots',
                                replace=True,
                                max_columns=max_timeslots)
                cols = tbl.columns
                for c in cols:
                    if re.search('timeslots', c, re.IGNORECASE) is not None:
                        tbl.unpack_dict(c)

        return tbl