Example #1
File: examples.py  Project: datamade/petl
# search (the snippet begins mid-example; the import and an illustrative
# table1 are restored so it runs)
from petl import search, look
table1 = (('foo', 'bar'),
          ('orange', 1),
          ('mango', 2),
          ('banana', 3))
# search any field
table2 = search(table1, '.g.')
look(table2)
# search a specific field
table3 = search(table1, 'foo', '.g.')
look(table3)


# addcolumn

table1 = (('foo', 'bar'),
          ('A', 1),
          ('B', 2))

from petl import addcolumn, look
look(table1)
col = [True, False]
table2 = addcolumn(table1, 'baz', col)
look(table2)


# lookupjoin
table1 = (('id', 'color', 'cost'), 
          (1, 'blue', 12), 
          (2, 'red', 8), 
          (3, 'purple', 4))
table2 = (('id', 'shape', 'size'), 
          (1, 'circle', 'big'), 
          (1, 'circle', 'small'), 
          (2, 'square', 'tiny'), 
          (2, 'square', 'big'), 
          (3, 'ellipse', 'small'), 
          (3, 'ellipse', 'tiny'))

# the snippet is truncated here; the join itself would look like:
from petl import lookupjoin
table3 = lookupjoin(table1, table2, key='id')
look(table3)
Example #3
# (snippet begins mid-script; n_table, m_table and out_cursor are defined earlier)

# movie personnel
d_movie_personnel = etl.cut(n_table, 'imdb_name_id', 'name', 'birth_name')

# title
d_title = etl.cut(m_table, 'imdb_title_id', 'title', 'original_title')

# genre
d_genre = etl.distinct(etl.cut(m_table, 'genre'))
rows = etl.nrows(d_genre)
generated = []
# mint one UUID per distinct genre using the output database connection
for _ in range(rows):
    out_cursor.execute('SELECT UUID();')
    generated.append(out_cursor.fetchone()[0])
d_genre = etl.addcolumn(d_genre, 'genre_id', generated)

# date
d_date = etl.distinct(etl.cut(m_table, 'year', 'date_published'))
rows = etl.nrows(d_date)
generated = []
for _ in range(rows):
    out_cursor.execute('SELECT UUID();')
    generated.append(out_cursor.fetchone()[0])
d_date = etl.addcolumn(d_date, 'date_id', generated)

# country 
d_country = etl.distinct(etl.cut(m_table, 'country'))
rows = etl.nrows(d_country)
generated = []
# (the loop is truncated in the original; completed to match the
# genre/date pattern above)
for _ in range(rows):
    out_cursor.execute('SELECT UUID();')
    generated.append(out_cursor.fetchone()[0])
d_country = etl.addcolumn(d_country, 'country_id', generated)
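
Minting one identifier per row with a database round trip is slow. A client-side sketch using Python's standard uuid module (an assumption: random version-4 UUIDs are acceptable where MySQL's UUID() would return version-1):

import uuid
generated = [str(uuid.uuid4()) for _ in range(rows)]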

Example #4

from petl import addcolumn

def add_columns(dataset, column_name, column_data):
    """Return dataset with column_data appended as a new column."""
    new_dataset = addcolumn(dataset, column_name, column_data)
    return new_dataset
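
A minimal usage sketch of the wrapper (table and values are illustrative):

tbl = [['foo'], ['A'], ['B']]
tbl2 = add_columns(tbl, 'flag', [True, False])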
Example #5
def join_annotations_with_gff_features(annot_file,
                                       annot_file_out,
                                       feature_type="CDS",
                                       annot_join="interval",
                                       annot_sep="\t",
                                       gff_files=None,
                                       gff_files_list=None,
                                       max_overlap_only=True):
    """Join features in GFF3 file with annotations.
    Add annotations as attributes of GFF features.
    The way to join with the annotation file must be provided in annot_join, which
    can be either keyed by nucleotide intervals in the first three fields like GFF itself, or
    by feature IDs (if annotation was done for protein sequences - not
    yet implemented).
    Coordinates and strand of each annotation will be replaced with those of overlapping feature,
    if any (can result in more than one record per annotation).
    """
    import petl
    import petlx

    ann_all = petl.io.csv.fromcsv(annot_file, delimiter=annot_sep)
    ann_all = petl.convert(ann_all,
                           ('query_start', 'query_end', 'query_strand'), int)
    ann_all = petl.addcolumn(ann_all, 'ann_rec_ind', range(ann_all.nrows()))

    with petl_opened_file_source(annot_file_out, "w") as annot_out:
        for i_inp, gff_file in enumerate(
                util.glob_files(files_globs=gff_files,
                                files_globs_list=gff_files_list)):
            log.info("Working on feature file {}".format(gff_file))
            feat = petlx.bio.gff3.fromgff3(gff_file)
            feat = petl_fix_gff_coord(feat)

            feat_seqid_set = set(feat["seqid"])

            if feature_type:
                feat = petl.selecteq(feat, 'type', feature_type)

            ann = petl.selectin(ann_all, 'query_id', feat_seqid_set)

            ## Somehow we get many ORFs in GFFs (and GenBank files) from both
            ## RASTtk and ClovR where one ORF ends at the start position of
            ## another ORF (and the BLAST match starts at the start of the
            ## second ORF).
            jn = petl.transform.intervals.intervalleftjoin(
                ann,
                feat,
                rstart="start",
                rstop="end",
                lstart="query_start",
                lstop="query_end",
                rkey="seqid",
                lkey="query_id",
                rprefix="feat_")
            jn = petl.addfield(jn,"overlap_len",
                               lambda rec: (min(rec['end'],rec['query_end']) - max(rec['start'],rec['query_start']) + 1) \
                                       if rec['start'] is not None else 0)
            if max_overlap_only:
                jn = petl.groupselectmax(jn,
                                         key="ann_rec_ind",
                                         value="overlap_len")
            _strand_conv = {'+': 1, '-': -1, '.': 0}
            jn = petl.convert(jn,
                              {
                                  'query_start' : lambda v,row: row.start if row.start is not None else v,
                                  'query_end': lambda v,row: row.end if row.end is not None else v,
                                  'query_strand': lambda v,row,_strand_conv=_strand_conv: _strand_conv[row.strand] \
                                      if row.strand is not None else row.query_strand
                              },
                              pass_row=True
                              )
            if i_inp == 0:
                out_func = petl.io.csv.tocsv
            else:
                out_func = petl.io.csv.appendcsv
            out_func(jn, annot_out, delimiter=annot_sep)
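
A hypothetical invocation of the function above (file names are illustrative, not taken from the source project):

join_annotations_with_gff_features(
    annot_file="blast_hits.tsv",
    annot_file_out="blast_hits.annotated.tsv",
    feature_type="CDS",
    gff_files=["assembly.gff3"])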
Example #6
File: basics.py  Project: DeanWay/petl
# addrownumbers()
#################

# (the snippet begins mid-example; the import and header row are restored
# to match the petl docs example)
import petl as etl
table1 = [['foo', 'bar'],
          ['A', 9],
          ['C', 2],
          ['F', 1]]
table2 = etl.addrownumbers(table1)
table2


# addcolumn()
#############

import petl as etl
table1 = [['foo', 'bar'],
          ['A', 1],
          ['B', 2]]
col = [True, False]
table2 = etl.addcolumn(table1, 'baz', col)
table2
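
A related sketch: petl's documented addcolumn signature also accepts an index keyword, which inserts the new column at a given position rather than appending it (worth verifying against the installed version):

table3 = etl.addcolumn(table1, 'baz', col, index=1)
table3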


# addfieldusingcontext()
########################

import petl as etl
table1 = [['foo', 'bar'],
          ['A', 1],
          ['B', 4],
          ['C', 5],
          ['D', 9]]
def upstream(prv, cur, nxt):
    if prv is None:
        return None
    else:
        return cur.bar - prv.bar
# (the snippet is truncated here; per the petl docs the example continues
# along these lines)
table2 = etl.addfieldusingcontext(table1, 'baz', upstream)
table2
Example #7

# (snippet begins mid-script; mysql_engine, tables, sqlite_path and
# socialmedia_csv, plus imports such as petl as etl and sqlalchemy's
# create_engine, are defined earlier)
mysql_engine.connect().execute('SET SQL_MODE=ANSI_QUOTES')

# connect to sqlite
sqlite_engine = create_engine('sqlite:///' + sqlite_path)

# migrate tables
for table in tables:
    print(table)
    data = etl.fromdb(sqlite_engine, 'select * from ' + table)
    if table == 'employees':
        recs = etl.records(data)
        emails = []
        for rec in recs:
            emails.append(rec['first_name'] + '.' + rec['last_name'] +
                          '@mycompany.com')
        data2 = etl.addcolumn(data, 'email', emails)
    else:
        data2 = data
    etl.todb(data2, mysql_engine, table, create=True)

# load CSV file
data = etl.fromcsv(source=socialmedia_csv)
recs = etl.records(data)
# determine employee numbers
empnos = []
for rec in recs:
    sub = etl.fromdb(
        sqlite_engine,
        "SELECT emp_no FROM employees "
        "WHERE last_name = '" + rec['last_name'] + "' "
        "AND first_name = '" + rec['first_name'] + "' "
        "AND birth_date = '" + rec['birth_date'] + "'")
    # (the original snippet is truncated here; presumably it goes on to
    # append the matching emp_no values to empnos)