def _translate_from_prep(cls, chr_list, flip): ''' Take uploaded values and streamline into ints and sequence ends. ''' # Some files come in with the reads flipped. # Allow for insertion of flipped strands. if flip: strand_char = '+' else: strand_char = '-' for chr_id in chr_list: update_query = """ INSERT INTO "{0}_{chr_id}" (chromosome_id, strand, "start", "end", start_end, refseq) SELECT * FROM ( SELECT {chr_id}, (CASE WHEN prep.strand_char = '{strand_char}' THEN 1 ELSE 0 END), prep."start", (prep."start" + char_length(prep.sequence_matched)), int8range(prep."start", (prep."start" + char_length(prep.sequence_matched)), '[]'), NULL::boolean FROM "{1}" prep JOIN "{2}" chr ON chr.name = prep.chromosome WHERE chr.id = {chr_id}) derived; """.format(cls._meta.db_table, cls.prep_table, Chromosome._meta.db_table, strand_char=strand_char, chr_id=chr_id) execute_query(update_query)
def create_parent_table(cls, name): ''' Create table that will be used for these tags, dynamically named. ''' name = 'tag_' + name cls.set_table_name(name) table_sql = """ CREATE TABLE "{0}" ( id integer NOT NULL, chromosome_id integer default NULL, strand smallint default NULL, "start" bigint, "end" bigint, start_end int8range, refseq boolean default false ); CREATE SEQUENCE "{0}_id_seq" START WITH 1 INCREMENT BY 1 NO MINVALUE NO MAXVALUE CACHE 1; ALTER SEQUENCE "{0}_id_seq" OWNED BY "{0}".id; ALTER TABLE "{0}" ALTER COLUMN id SET DEFAULT nextval('"{0}_id_seq"'::regclass); ALTER TABLE ONLY "{0}" ADD CONSTRAINT {1}_pkey PRIMARY KEY (id); """.format(cls._meta.db_table, cls.name) execute_query(table_sql) cls.table_created = True
def create_schema(): try: execute_query(""" CREATE SCHEMA "{0}"; GRANT Create,Usage ON SCHEMA "{0}" TO "{1}"; """.format(current_settings.CURRENT_SCHEMA, current_settings.DATABASES['default']['USER'])) except utils.DatabaseError: _print('Schema already created.')
def _add_indices(cls, chr_list): for chr_id in chr_list: update_query = """ CREATE INDEX {0}_{chr_id}_pkey_idx ON "{1}_{chr_id}" USING btree (id); CREATE INDEX {0}_{chr_id}_chr_idx ON "{1}_{chr_id}" USING btree (chromosome_id); CREATE INDEX {0}_{chr_id}_strand_idx ON "{1}_{chr_id}" USING btree (strand); CREATE INDEX {0}_{chr_id}_start_end_idx ON "{1}_{chr_id}" USING gist (start_end); ANALYZE "{1}_{chr_id}"; """.format(cls.name, cls._meta.db_table, chr_id=chr_id) execute_query(update_query)
def delete_prep_columns(cls): ''' .. warning:: DELETES the associated sequence and strand_char columns to conserve space. ''' table_sql = """ ALTER TABLE "{0}" DROP COLUMN sequence_matched; ALTER TABLE "{0}" DROP COLUMN strand_char; ALTER TABLE "{0}" DROP COLUMN chromosome; """.format(cls._meta.db_table) execute_query(table_sql) cls.table_created = False
def _set_scores(cls, chr_list): try: for chr_id in chr_list: print('Scoring transcripts for chromosome {}'.format(chr_id)) query = """ SELECT atlas_{0}_{1}{2}.calculate_scores({chr_id}); SELECT atlas_{0}_{1}{2}.calculate_standard_error({chr_id}); """.format(current_settings.GENOME, current_settings.CELL_TYPE.lower(), current_settings.STAGING, chr_id=chr_id) execute_query(query) except Exception as e: raise e
def create_partition_tables(cls): ''' Can't be multiprocessed; too many attempts to ANALYZE at once. ''' for chr_id in current_settings.GENOME_CHOICES[current_settings.GENOME]['chromosomes']: table_sql = """ CREATE TABLE "{0}_{1}" ( CHECK ( chromosome_id = {1} ) ) INHERITS ("{0}");""".format(cls._meta.db_table, chr_id) execute_query(table_sql) trigger_sql = ''' CREATE OR REPLACE FUNCTION {0}.atlas_tag_insert_trigger() RETURNS TRIGGER AS $$ DECLARE refseq_string text; start_end_string text; BEGIN IF NEW.refseq IS NULL THEN refseq_string := 'NULL'; ELSE refseq_string := NEW.refseq::text; END IF; IF NEW.start_end IS NULL THEN start_end_string := 'NULL'; ELSE start_end_string := 'int8range(' || quote_literal(NEW.start) || ',' || quote_literal(NEW."end") || ', ''[]'')'; END IF; EXECUTE 'INSERT INTO "{1}_' || NEW.chromosome_id || '" VALUES (' || quote_literal(NEW.id) || ',' || quote_literal(NEW.chromosome_id) || ',' || quote_literal(NEW.strand) || ',' || quote_literal(NEW.start) || ',' || quote_literal(NEW."end") || ',' || start_end_string || ',' || refseq_string || ')'; RETURN NULL; END; $$ LANGUAGE 'plpgsql'; -- Trigger function for inserts on main table CREATE TRIGGER atlas_tag_trigger BEFORE INSERT ON "{1}" FOR EACH ROW EXECUTE PROCEDURE {0}.atlas_tag_insert_trigger(); '''.format(current_settings.CURRENT_SCHEMA, cls._meta.db_table,) execute_query(trigger_sql)
def _draw_transcript_edges(cls, chr_list): try: for chr_id in chr_list: print( 'Drawing edges for transcripts for chr_id {}'.format(chr_id)) query = """ SELECT atlas_{0}_{1}{2}.draw_transcript_edges({chr_id},{3},{4}); """.format(current_settings.GENOME, current_settings.CELL_TYPE.lower(), current_settings.STAGING, MIN_ONE_RUN_TAGS, current_settings.MAX_EDGE, chr_id=chr_id) execute_query(query) except Exception as e: raise e
def create_prep_table(cls, name): ''' Create table that will be used for to upload preparatory tag information, dynamically named. ''' cls.set_prep_table('prep_' + name) table_sql = """ CREATE TABLE "{}" ( strand_char character(1) default NULL, chromosome varchar(50), "start" bigint, sequence_matched varchar(100) ); """.format(cls.prep_table) execute_query(table_sql)
def _add_transcripts_from_groseq(cls, chr_list, sequencing_run): try: for chr_id in chr_list: print('Adding transcripts for chromosome {}'.format(chr_id)) query = """ SELECT atlas_{}_{}_prep.save_transcripts_from_sequencing_run( {}, {}, '{}', {}, {}, {}, {}, {}, NULL, NULL); """.format(current_settings.GENOME, current_settings.CELL_TYPE.lower(), sequencing_run.id, chr_id, sequencing_run.source_table.strip(), MAX_GAP, TAG_EXTENSION, current_settings.MAX_EDGE, EDGE_SCALING_FACTOR, current_settings.DENSITY_MULTIPLIER) execute_query(query) except Exception as e: raise e
def _set_refseq(cls, chr_list): ''' Set refseq status for segmentation during stitching later on by overlapping with consolidated refseq transcripts. ''' for chr_id in chr_list: print('Setting Refseq status for chromosome {0}'.format(chr_id)) update_query = """ UPDATE "{0}_{1}" tag SET refseq = true FROM atlas_{2}_refseq_prep.atlas_transcript_{1} ref WHERE ref.start_end && tag.start_end AND ref.strand = tag.strand AND tag.refseq IS NULL; UPDATE "{0}_{1}" tag SET refseq = false WHERE refseq IS NULL; """.format(cls._meta.db_table, chr_id, current_settings.GENOME) execute_query(update_query)
def create_table(cls, name): ''' Create table that will be used for these peaks, dynamically named. ''' cls.set_table_name('peak_' + name) table_sql = """ CREATE TABLE "{0}" ( id int4, chromosome_id int4, "start" int8, "end" int8, "strand" int2, start_end int8range, "length" int4, summit int8, tag_count decimal(10,2) default NULL, raw_tag_count decimal(10,2) default NULL, score decimal(8,2) default NULL, p_value decimal(6,4) default NULL, p_value_exp int4 default NULL, log_ten_p_value decimal(10,2) default NULL, fold_enrichment decimal(10,2) default NULL, fdr_threshold decimal(6,4) default NULL, fdr_threshold_exp int4 default NULL ); CREATE SEQUENCE "{0}_id_seq" START WITH 1 INCREMENT BY 1 NO MINVALUE NO MAXVALUE CACHE 1; ALTER SEQUENCE "{0}_id_seq" OWNED BY "{0}".id; ALTER TABLE "{0}" ALTER COLUMN id SET DEFAULT nextval('"{0}_id_seq"'::regclass); ALTER TABLE ONLY "{0}" ADD CONSTRAINT {1}_pkey PRIMARY KEY (id); """.format(cls._meta.db_table, cls.name) execute_query(table_sql) cls.table_created = True
def _set_density(cls, chr_list, allow_extended_gaps=True, extension_percent='.2', null_only=True): ''' Force reset average tags for prep DB. ''' try: for chr_id in chr_list: print( 'Setting density for transcripts for chr_id {}'.format(chr_id)) query = """ SELECT atlas_{}_{}_prep.set_density({},{},{},{},{},{},{}); """.format(current_settings.GENOME, current_settings.CELL_TYPE.lower(), chr_id, current_settings.MAX_EDGE, EDGE_SCALING_FACTOR, current_settings.DENSITY_MULTIPLIER, allow_extended_gaps and 'true' or 'false', extension_percent, null_only and 'true' or 'false') execute_query(query) except Exception as e: raise e
def _stitch_together_transcripts(cls, chr_list, allow_extended_gaps=True, extension_percent='.2', set_density=False, null_only=True): ''' This is tag-level agnostic, stitching based on gap size alone. ''' try: for chr_id in chr_list: print( 'Stitching together transcripts for chr_id {}'.format(chr_id)) query = """ SELECT atlas_{}_{}_prep.save_transcripts_from_existing({}, {}); """.format(current_settings.GENOME, current_settings.CELL_TYPE.lower(), chr_id, MAX_STITCHING_GAP) execute_query(query) if set_density: print( 'Setting density for transcripts for chr_id {}'.format(chr_id)) query = """ SELECT atlas_{}_{}_prep.set_density({},{},{},{},{},{},{}); """.format(current_settings.GENOME, current_settings.CELL_TYPE.lower(), chr_id, current_settings.MAX_EDGE, EDGE_SCALING_FACTOR, current_settings.DENSITY_MULTIPLIER, allow_extended_gaps and 'true' or 'false', extension_percent, null_only and 'true' or 'false') execute_query(query) except Exception as e: raise e
make_option('-g', '--genome', action='store', type='string', dest='genome', default='mm9', help='Currently supported: mm9, dm3'), make_option('-c', '--cell_type', action='store', type='string', dest='cell_type', help='Cell type for this run?'), make_option('-f', '--final', action='store_true', dest='final', default=False, help='Generate only the final schema and tables, without the prep schema?'), make_option('-p', '--prep', action='store_true', dest='prep', default=False, help='Generate only the prep schema and tables, without the final schema?'), ] if __name__ == '__main__': django_setup() parser = SetUpDatabaseParser() options, args = parser.parse_args() genome = parser.set_genome(options) cell_type, cell_base = parser.set_cell(options) generator = AtlasSqlGenerator(cell_type=cell_type, genome=genome) print('Creating database schema and tables...') q = (options.final and generator.final_set()) \ or (options.prep and generator.prep_set()) \ or generator.all_sql() execute_query(q)
make_option('-c', '--cell_type', action='store', type='string', dest='cell_type', help='Cell type for this run?'), make_option('--schema_only', action='store_true', dest='schema_only', default=False, help='Only create schema and stop before importing all the data?'), ] if __name__ == '__main__': django_setup() parser = SetUpDatabaseParser() options, args = parser.parse_args() cell_type, cell_base = parser.set_cell(options) genome = parser.set_genome(options) generator = GenomeResourcesSqlGenerator(cell_type=cell_type, genome=genome) print('Creating genome resources database schema and tables...') q = generator.all_sql() execute_query(q) if not options.schema_only and genome in current_settings.GENOME_CHOICES.keys(): q_set = generator.fill_tables() for q in q_set: if q: execute_query(q) # And cleanup the import tables: execute_query(generator.cleanup())
def add_indices(cls): update_query = """ CREATE INDEX {0}_chr_idx ON "{1}" USING btree (chromosome_id); CREATE INDEX {0}_start_end_idx ON "{1}" USING gist (start_end); """.format(cls.name, cls._meta.db_table) execute_query(update_query)
def delete_prep_table(cls): table_sql = """ DROP TABLE "{0}" CASCADE; """.format(cls.prep_table) execute_query(table_sql)