예제 #1
0
import pandas as pd
from db_engine import create
from arrays import make_array
from numpy.linalg import norm

engine = create()  # database connection created using SQLAlchemy library

# Distance on the genome between each enhancer fragment and its promoter TSS.

# Step 1: add the integer column that will receive the distance.
add_genome_distance_sql = (
    "ALTER TABLE sahlen_promoter_enhancer "
    "ADD COLUMN genome_distance INT"
)
engine.execute(add_genome_distance_sql)

# Step 2: genome_distance = |fragment middle - Promoter_TSS|, where the middle
# of the fragment is written as end - (end - start) / 2.  Only rows whose
# promoter and fragment share a chromosome are updated; all other rows are
# left untouched.
fill_genome_distance_sql = (
    "UPDATE sahlen_promoter_enhancer "
    "SET genome_distance = "
    "ABS("
    "Fragment_end_coordinate-(Fragment_end_coordinate-Fragment_start_coordinate)/2 "
    "- Promoter_TSS"
    ") "
    "WHERE Promoter_chr=Fragment_chromosome"
)
engine.execute(fill_genome_distance_sql)

# Degree of each promoter: the number of distinct enhancer fragments
# (chromosome, new_enh_start, new_enh_end) linked to it.  The result is
# materialised as the table sahlen_frag_per_promo.
# NOTE(review): Promoter_TSS is selected but is not part of the GROUP BY;
# a server running with ONLY_FULL_GROUP_BY may reject this -- confirm.
frag_per_promo_sql = " ".join((
    "CREATE TABLE sahlen_frag_per_promo",
    "SELECT Promoter_chr, Promoter_TSS, new_promo_start, new_promo_end,",
    "count(distinct Fragment_chromosome, new_enh_start, new_enh_end)",
    "AS degree",
    "FROM sahlen_promoter_enhancer",
    "GROUP BY Promoter_chr, new_promo_start, new_promo_end",
))
engine.execute(frag_per_promo_sql)

engine.execute(
    "CREATE TABLE sahlen_promo_per_frag "
    "SELECT Fragment_chromosome, Fragment_start_coordinate, Fragment_end_coordinate, "
    "new_enh_start, new_enh_end, "
    "count(distinct Promoter_chr, new_promo_start, new_promo_end) "
예제 #2
0
import pandas as pd
from numpy.linalg import norm
from arrays import make_array
from db_engine import create

engine = create()


# For every promoter (chromosome, new_promo_start, new_promo_end) count the
# distinct partner promoters it is paired with in sahlen_promoter_promoter and
# store the counts, as column `num`, in the new table sahlen_promo_per_promo.
#
# No `con=` keyword is passed here: that argument belongs to pandas.read_sql.
# Assuming `engine` is a SQLAlchemy engine, execute() would forward an extra
# keyword as a bind parameter of the statement instead of ignoring it.
engine.execute(
    "CREATE TABLE sahlen_promo_per_promo "
    "SELECT `Promoter chr`, new_promo_start, new_promo_end, "
    "count(distinct `Promoter chr.1`, new_promo2_start, new_promo2_end) as "
    "num from sahlen_promoter_promoter "
    "group by `Promoter chr`, new_promo_start, new_promo_end")

# For every (`Promoter chr`, `Promoter TSS`) group, store the spread
# MAX - MIN of the partner TSS positions (`Promoter TSS.1`) as promo_range
# in the new table sahlen_range_promo_per_promo.
range_promo_per_promo_sql = " ".join([
    "CREATE TABLE sahlen_range_promo_per_promo",
    "SELECT MAX(`Promoter TSS.1`)-MIN(`Promoter TSS.1`)",
    "AS promo_range, `Promoter TSS`, `Promoter chr`",
    "FROM sahlen_promoter_promoter",
    "GROUP BY `Promoter chr`, `Promoter TSS`",
])
engine.execute(range_promo_per_promo_sql)


# Dataset label; it matches the `sahlen_` prefix of the tables queried here.
dataset: str = "sahlen"
# Absolute, machine-specific directory (presumably where the input data
# lives -- verify before running on another machine).
path: str = "/home/kinga/Dokumenty/Studia/licencjat_old/data/"

# Add a FLOAT column named kmer_distance to sahlen_promoter_promoter.
add_kmer_distance_sql = (
    "ALTER TABLE sahlen_promoter_promoter "
    "ADD kmer_distance FLOAT"
)
engine.execute(add_kmer_distance_sql)

# Load the window coordinates and chromosomes of both promoters of every pair
# stored in sahlen_promoter_promoter.
# Optional filter, currently disabled (append it to the query to re-enable):
#     where kmer_distance is null or kmer_distance=0
sahlen = pd.read_sql(
    "SELECT new_promo_start, new_promo_end, new_promo2_start, new_promo2_end, "
    "`Promoter chr`, `Promoter chr.1` from "
    "sahlen_promoter_promoter ",
    con=engine,
)

# Normalise the column labels: trim whitespace, lower-case, turn spaces into
# underscores and drop "(", ")" and "." (e.g. "Promoter chr.1" becomes
# "promoter_chr1").
# regex=False is explicit because "(", ")" and "." are regex metacharacters and
# the default value of `regex` is not the same across pandas versions; every
# pattern here is meant as a literal character.
sahlen.columns = (
    sahlen.columns.str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
    .str.replace('.', '', regex=False)
)