Пример #1
0
def main():

	if not table_empty(TABLE_NAME):
		print(f"Table '{TABLE_NAME} already populated, skipping loading of new records")
		return

	path_in = os.path.join(data_dir, FILE_NAME_IN)
	path_out = os.path.join(data_dir, FILE_NAME_OUT)
	statement = load_statement.format(dbname=sql.Identifier(TABLE_NAME))

	with open(path_in) as infile:
		reader = csv.DictReader(infile, delimiter = ';')
		fieldnames = reader.fieldnames

		with open(path_out, 'w', newline='') as csv_file:
			fieldnames = fieldnames
			writer = csv.DictWriter(csv_file, fieldnames, delimiter=';')
			writer.writeheader()
			for row in reader:
				row.update({fieldname: value.strip() for (fieldname, value) in row.items()})
				row.update({fieldname: value.replace('.','') for (fieldname, value) in row.items()})
				writer.writerow(row)

	connection = get_connection()
	cursor = connection.cursor()

	try:
		print("Loading CBS energy use records, this might take a minute or so.")
		cursor.execute(statement, (path_out,))
		cursor.close()
		connection.commit()
		connection.close()
		print("Done.")
	except UndefinedFile:
		print(f"\nError: CBS energy use data file not found.\nExpected file at {path}.")
def delete_labels_without_vbo_id():
    # Around 88 000 energy labels have no associated vbo_id,
    # this is unhelpful in analysis because we can't link them
    # to building years or dwelling types from the BAG.
    # Note: delete_labels_not_in_bag() is not enough since it
    # ignores labels with NULL as vbo_id.
    print('Deleting labels without vbo_id...')
    delete_statement = "DELETE FROM energy_labels WHERE vbo_id IS NULL"
    connection = get_connection()
    cursor = connection.cursor()
    cursor.execute(delete_statement)
    print(f'\tDeleted {cursor.rowcount} labels')

    # After this, we can set the column to not accept
    # NULL values anymore
    not_null_statement = '''
	ALTER TABLE energy_labels
	ALTER COLUMN vbo_id
	SET NOT NULL'''
    print('\tSetting column to NOT NULL')
    cursor.execute(not_null_statement)

    cursor.close()
    connection.commit()
    connection.close()
Пример #3
0
def main():

    if not table_empty(TABLE_NAME):
        print(
            f"Table '{TABLE_NAME}' already populated, skipping loading of new records"
        )
        return

    path = os.path.join(data_dir, FILE_NAME)
    statement = load_statement.format(dbname=sql.Identifier(TABLE_NAME))

    connection = get_connection()
    cursor = connection.cursor()

    try:
        print(
            "Loading RVO Warmtenetten records, this might take a minute or so."
        )
        cursor.execute(statement, (path, ))
        cursor.close()
        connection.commit()
        connection.close()
        print("Done.")
    except UndefinedFile:
        print(
            f"\nError: RVO Warmtenetten data file not found.\nExpected file at {path}."
        )
Пример #4
0
def get_huri_rel_cegsb01_data(std_batches):

    connect = database_utils.get_connection()
    cursor = connect.cursor()

    disease_table = 'hi2018_paper.OMIM_disease_tissue'

    file1 = open(
        '../data/katjas_data/GTEx/expression_datasets/GTExv6_JPnorm_subtypes_no_cell_no_testis.txt',
        'r')
    tissues = [line[:-1] for line in file1.readlines()]
    file1.close()

    analysis1 = Analysis('analysis1')
    analysis1.tissues = tissues
    analysis1.get_std_batch_assay_id_dict(cursor, std_batches)
    analysis1.load_diseases(cursor, disease_table)
    print('Number of diseases loaded:', len(analysis1.diseases))

    analysis1.load_genes(cursor, tissues)
    print('Number of genes loaded:', len(analysis1.genes))

    analysis1.load_disease_alleles(cursor)
    print('Number of alleles loaded:', len(analysis1.alleles))

    analysis1.load_interactions(cursor)
    print('Number of interactions loaded:', len(analysis1.interactions))

    return analysis1
Пример #5
0
def format_bioplex_complexes(infile, outfile):

    connect = database_utils.get_connection()
    df_bp = pandas.read_table(infile)
    query = """select distinct ensembl_gene_id_short,entrez_gene_id
	           from horfeome_annotation_gencode27.gencode2entrez where entrez_gene_id is not NULL"""
    geneid_ensg_map = pandas.read_sql(query, connect)
    df_bp = df_bp.merge(geneid_ensg_map,
                        left_on='GeneID',
                        right_on='entrez_gene_id',
                        how='inner').drop(['entrez_gene_id'], axis=1)
    gene_ids = df_bp['ensembl_gene_id_short'].unique()
    annot_dict = {}
    for gene_id in gene_ids:
        annot_dict[gene_id] = set(
            df_bp.loc[df_bp['ensembl_gene_id_short'] == gene_id,
                      'Cluster Number'].values)
        if len(annot_dict[gene_id]) > 1:
            print gene_id

    target = open(outfile, 'w')
    target.write('ensembl_gene_id\tBioplex_complex_ids\n')
    for gene_id, cluster_ids in annot_dict.items():
        target.write(gene_id + '\t' +
                     '|'.join([str(i)
                               for i in sorted(list(cluster_ids))]) + '\n')
    target.close()
Пример #6
0
def load_cbs_table(table_id, typed_data_set=False):
    '''
	Create a Postgres table with the required structure,
	downloads the data from CBS, and loads the data into
	the Postgres table.
	'''
    try:
        table_name = get_sanitized_cbs_table_title(table_id, typed_data_set)
    except ConnectionError as e:
        print(
            f'\tERROR! Could not load CBS table due to a ConnectionError. Are you connected to the internet? Error:\n\t{e}'
        )
        # do not continue processing
        return

    print(f'Loading CBS table {table_name}')

    # Do not load data
    if table_exists(table_name):
        print('Table already exists, skipping downloading and loading.')

    else:
        print('Creating table...')
        create_table_for_cbs_table(table_id, table_name)

        print('Downloading data...')
        if typed_data_set:
            data = cbsodata.get_meta(table_id, 'TypedDataSet')
        else:
            data = cbsodata.get_data(table_id)

        print('Inserting data...')
        sanitize_key_dict = get_sanitize_key_dict(table_id)
        connection = get_connection()
        cursor = connection.cursor()

        for row in data:
            # Before we can insert the data, we have to manipulate
            # the key so it matches the columns in the created
            # Postgres table, and we might need to trim the white space
            # from the values.
            row_dict = {
                sanitize_key_dict[key]: sanitize_data(value)
                for key, value in row.items()
            }
            # TODO: it's probably more efficient to just pass a list,
            # I think the required keys are always there. And then we can
            # more easily batch these INSERTs.
            insert_dict(table_name, row_dict, cursor)

        cursor.close()
        connection.commit()
        connection.close()

    add_indices(table_name)
    print()
def main():
	path = os.path.join(data_dir, FILE_NAME)
	statement = load_statement.format(dbname=sql.Identifier(TABLE_NAME))

	connection = get_connection()
	cursor = connection.cursor()

	try:
		print("Loading household consumption records")
		cursor.execute(statement, (path,))
	except UndefinedFile:
		print(f"\nError: household consumption data file not found.\nExpected file at {path}.")

	cursor.close()
	connection.commit()
	connection.close()
Пример #8
0
def main():
    try:
        with open(path, 'r') as file:
            header = file.readline().strip()
            column_names = header.split(';')
    except FileNotFoundError:
        print(
            f"Error: WoON survey data file not found.\nExpected file at {path}."
        )
        return

    columns = [(sanitize_column_name(column_name), 'text')
               for column_name in column_names]

    print('Creating table...')
    create_table(TABLE_NAME, columns)

    if not table_empty(TABLE_NAME):
        print(
            f"Table '{TABLE_NAME}' already populated, skipping loading of new records"
        )
        return

    connection = get_connection()
    cursor = connection.cursor()
    # TODO: make this idempotent somehow?
    statement = load_statement.format(table_name=sql.Identifier(TABLE_NAME))
    try:
        print("Loading WoON survey data...")
        cursor.execute(statement, (path, ))

    except UndefinedFile:
        print(
            f"Error: WoON survey data file not found.\nExpected file at {path}."
        )

    columns_to_alter = ['ew_huis', 'ew_pers']
    for column in columns_to_alter:
        alter_column_to_number(cursor, column)

    print("Committing...")
    cursor.close()
    connection.commit()
    connection.close()
def load_energy_labels_data():

    connection = get_connection()
    cursor = connection.cursor()
    path = os.path.join(data_dir, FILENAME)

    insert_statement = sql.SQL('INSERT INTO {table_name} VALUES %s').format(
        table_name=sql.Identifier(env['EP_ONLINE_DBNAME']))

    # Bind variables so we can pass a unary function to fast_iter.
    bounded_handle_element = partial(handle_element,
                                     cursor=cursor,
                                     insert_statement=insert_statement)
    context = ET.iterparse(path, tag='Pandcertificaat')
    fast_iter(context, bounded_handle_element)

    cursor.close()
    connection.commit()
    connection.close()
def delete_labels_not_in_bag():
    # Sidenote: my naive first query was:
    # DELETE FROM energy_labels WHERE vbo_id NOT IN (SELECT vbo_id FROM bag)
    # but that takes ages!
    # This with the JOIN method seems backwards ("JOIN and then check for NULL
    # to see if the JOIN was succesful?!"), but is much faster.
    print('\tDeleting labels not in BAG...')
    delete_statement = '''
	DELETE FROM energy_labels
	WHERE vbo_id IN (
		SELECT energy_labels.vbo_id
		FROM energy_labels
		LEFT JOIN bag
		ON energy_labels.vbo_id = bag.vbo_id
		WHERE bag.vbo_id IS NULL
	)'''
    connection = get_connection()
    cursor = connection.cursor()
    cursor.execute(delete_statement)
    print(f'\tDeleted {cursor.rowcount} labels')
    cursor.close()
    connection.commit()
    connection.close()
Пример #11
0
				outstring = outstring[:-1]
			else:
				outstring = str(edge[attr])
			target.write('\t' + outstring)
		target.write('\n')

	target.close()


if __name__ == '__main__':

	network_name = sys.argv[1]
	tissues = sys.argv[2].split(',')
	output_folder = sys.argv[3]

	connect = database_utils.get_connection()
	cursor = connect.cursor()

	file1 = open(config.NT_cond_file,'r')
	all_tissues = file1.readlines()
	file1.close()
	all_tissues = [c[:-1] for c in all_tissues]

	if len(tissues) == 1 and tissues[0] == 'all':
		tissues = all_tissues

	print('load gene expression data')
	GTEx = pandas.read_table(config.NT_expr_file,index_col=0)
	TiPmatrix = pandas.read_table(config.NT_pref_expr_file,index_col=0)
	GTEx_tissues = TiPmatrix.columns
	for tissue in GTEx_tissues: