look(table2)

# search a specific field
table3 = search(table1, 'foo', '.g.')
look(table3)

# addcolumn
table1 = (('foo', 'bar'),
          ('A', 1),
          ('B', 2))
from petl import addcolumn, look
look(table1)
col = [True, False]
# petl signature is addcolumn(table, field, col): the new header comes
# before the column values (original had the two arguments swapped).
table2 = addcolumn(table1, 'baz', col)
look(table2)

# lookupjoin
table1 = (('id', 'color', 'cost'),
          (1, 'blue', 12),
          (2, 'red', 8),
          (3, 'purple', 4))
table2 = (('id', 'shape', 'size'),
          (1, 'circle', 'big'),
          (1, 'circle', 'small'),
          (2, 'square', 'tiny'),
          (2, 'square', 'big'),
          (3, 'ellipse', 'small'),
          (3, 'ellipse', 'tiny'))
def _db_uuids(cursor, count):
    """Return a list of *count* UUID strings generated by the DB server.

    One round-trip per UUID via ``SELECT UUID();`` on *cursor*.
    """
    ids = []
    for _ in range(count):
        cursor.execute('SELECT UUID();')
        ids.append(cursor.fetchone()[0])
    return ids


# movie personnel dimension
d_movie_personnel = etl.cut(n_table, 'imdb_name_id', 'name', 'birth_name')

# title dimension
d_title = etl.cut(m_table, 'imdb_title_id', 'title', 'original_title')

# genre dimension: distinct genres, each keyed by a server-generated UUID
d_genre = etl.distinct(etl.cut(m_table, 'genre'))
d_genre = etl.addcolumn(d_genre, 'genre_id', _db_uuids(out_cursor, etl.nrows(d_genre)))

# date dimension
d_date = etl.distinct(etl.cut(m_table, 'year', 'date_published'))
d_date = etl.addcolumn(d_date, 'date_id', _db_uuids(out_cursor, etl.nrows(d_date)))

# country dimension (its id column is filled in below)
d_country = etl.distinct(etl.cut(m_table, 'country'))
rows = etl.nrows(d_country)
generated = []
def add_columns(dataset, column_name, column_data):
    """Return *dataset* extended with one new column.

    Thin wrapper around petl's ``addcolumn``: appends the values in
    *column_data* under the header *column_name* and returns the
    resulting table (the input table is not modified).
    """
    return addcolumn(dataset, column_name, column_data)
def join_annotations_with_gff_features(annot_file, annot_file_out,
                                       feature_type="CDS",
                                       annot_join="interval",
                                       annot_sep="\t",
                                       gff_files=None,
                                       gff_files_list=None,
                                       max_overlap_only=True):
    """Join features in GFF3 file with annotations. Add annotations as attributes of GFF features.

    The way to join with the annotation file must be provided in annot_join, which
    can be either keyed by nucleotide intervals in the first three fields like GFF
    itself, or by feature IDs (if annotation was done for protein sequences - not
    yet implemented). Coordinates and strand of each annotation will be replaced
    with those of overlapping feature, if any (can result in more than one record
    per annotation).

    :param annot_file: CSV of annotations, delimited by *annot_sep*.
    :param annot_file_out: output CSV path/source; all input GFF files are
        appended into this single output.
    :param feature_type: if truthy, only GFF records of this ``type`` are joined.
    :param annot_join: join mode; only the interval mode is implemented here —
        NOTE(review): the value is never inspected in this body.
    :param annot_sep: field delimiter for both input and output annotation CSVs.
    :param gff_files, gff_files_list: globs / list-file of GFF3 inputs,
        expanded by ``util.glob_files`` (project helper).
    :param max_overlap_only: keep only the single overlapping feature with the
        largest overlap per annotation record.
    """
    import petl
    import petlx
    ann_all = petl.io.csv.fromcsv(annot_file, delimiter=annot_sep)
    ann_all = petl.convert(ann_all, ('query_start', 'query_end', 'query_strand'), int)
    # Unique per-row index; used below as the grouping key so groupselectmax
    # can pick one best-overlap feature per original annotation record.
    ann_all = petl.addcolumn(ann_all, 'ann_rec_ind', range(ann_all.nrows()))
    # petl_opened_file_source is a project helper — presumably yields a petl
    # file source opened for writing; TODO confirm.
    with petl_opened_file_source(annot_file_out, "w") as annot_out:
        for i_inp, gff_file in enumerate(
                util.glob_files(files_globs=gff_files,
                                files_globs_list=gff_files_list)):
            log.info("Working on feature file {}".format(gff_file))
            feat = petlx.bio.gff3.fromgff3(gff_file)
            feat = petl_fix_gff_coord(feat)
            # Restrict annotations to sequences actually present in this GFF,
            # computed before any feature_type filtering.
            feat_seqid_set = set(feat["seqid"])
            if feature_type:
                feat = petl.selecteq(feat, 'type', feature_type)
            ann = petl.selectin(ann_all, 'query_id', feat_seqid_set)
            ## somehow we get many ORFs in GFFs (and Genbank files) from both RASTtk and ClovR where one
            ## ORFs ends at the start position of another ORF (and the BLAST match starts at the start of the
            ## second ORF).
            # Left join: every annotation is kept even without an overlapping
            # feature; feature fields get the "feat_" prefix on name clashes.
            jn = petl.transform.intervals.intervalleftjoin(
                ann, feat,
                rstart="start", rstop="end",
                lstart="query_start", lstop="query_end",
                rkey="seqid", lkey="query_id",
                rprefix="feat_")
            # Overlap length in bases (inclusive coordinates, hence the +1);
            # 0 when no feature matched (rec['start'] is None on a left-join miss).
            jn = petl.addfield(jn, "overlap_len",
                               lambda rec: (min(rec['end'], rec['query_end']) - max(rec['start'], rec['query_start']) + 1) \
                                   if rec['start'] is not None else 0)
            if max_overlap_only:
                jn = petl.groupselectmax(jn, key="ann_rec_ind", value="overlap_len")
            # Replace annotation coordinates/strand with the matched feature's,
            # when a feature matched; GFF strand symbols map to -1/0/+1.
            _strand_conv = {'+': 1, '-': -1, '.': 0}
            jn = petl.convert(jn, {
                'query_start': lambda v, row: row.start if row.start is not None else v,
                'query_end': lambda v, row: row.end if row.end is not None else v,
                'query_strand': lambda v, row, _strand_conv=_strand_conv: _strand_conv[row.strand] \
                    if row.strand is not None else row.query_strand
                },
                pass_row=True
                )
            # First GFF file writes the header; subsequent files append rows only.
            if i_inp == 0:
                out_func = petl.io.csv.tocsv
            else:
                out_func = petl.io.csv.appendcsv
            out_func(jn, annot_out, delimiter=annot_sep)
          ['A', 9],
          ['C', 2],
          ['F', 1]]
table2 = etl.addrownumbers(table1)
table2


# addcolumn()
#############

import petl as etl
table1 = [['foo', 'bar'],
          ['A', 1],
          ['B', 2]]
col = [True, False]
# addcolumn(table, field, col): one value per data row, in row order.
table2 = etl.addcolumn(table1, 'baz', col)
table2


# addfieldusingcontext()
########################

import petl as etl
table1 = [['foo', 'bar'],
          ['A', 1],
          ['B', 4],
          ['C', 5],
          ['D', 9]]
# Callback receives the previous, current and next row; no previous row
# exists for the first data row, hence the None guard.
def upstream(prv, cur, nxt):
    if prv is None:
        return None
mysql_engine.connect().execute('SET SQL_MODE=ANSI_QUOTES') # connect to sqlite sqlite_engine = create_engine('sqlite:///' + sqlite_path) # migrate tables for table in tables: print(table) data = etl.fromdb(sqlite_engine, 'select * from ' + table) if table == 'employees': recs = etl.records(data) emails = [] for rec in recs: emails.append(rec['first_name'] + '.' + rec['last_name'] + '@mycompany.com') data2 = etl.addcolumn(data, 'email', emails) else: data2 = data etl.todb(data2, mysql_engine, table, create=True) # load CSV file data = etl.fromcsv(source=socialmedia_csv) recs = etl.records(data) # determine employee numbers empnos = [] for rec in recs: sub = etl.fromdb( sqlite_engine, "SELECT emp_no FROM employees " + "where last_name = '" + rec['last_name'] + "' " + "and first_name = '" + rec['first_name'] + "' " + "and birth_date = '" + rec['birth_date'] + "' " +