import string
import pandas
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import math
import time
import sys
import personal_popgen
import re
import random
from itertools import izip

# Load the Danaus plexippus v3 scaffold FASTA through the project helper.
# NOTE(review): fasta_dict presumably returns a mapping of sequence name ->
# sequence; confirm against personal_popgen.
fasta_DPlex = personal_popgen.fasta_dict(
    '/proj/uppstore2017185/b2014034_nobackup/Venkat/Monarch_stuff/genome/Danaus_plexippus_v3_-_scaffolds.fa'
)

# Load the per-sample FASTA files from the LDhelmet script directory, one
# module-level name per file.
fasta_stm163_1 = personal_popgen.fasta_dict(
    '/proj/uppstore2017185/b2014034_nobackup/Venkat/Monarch_stuff/Scripts/LDhelmet/stm163_1.fa'
)
fasta_stm146_1 = personal_popgen.fasta_dict(
    '/proj/uppstore2017185/b2014034_nobackup/Venkat/Monarch_stuff/Scripts/LDhelmet/stm146_1.fa'
)
fasta_T9_1 = personal_popgen.fasta_dict(
    '/proj/uppstore2017185/b2014034_nobackup/Venkat/Monarch_stuff/Scripts/LDhelmet/T9_1.fa'
)
fasta_T14_1 = personal_popgen.fasta_dict(
    '/proj/uppstore2017185/b2014034_nobackup/Venkat/Monarch_stuff/Scripts/LDhelmet/T14_1.fa'
)
fasta_NJ203_1 = personal_popgen.fasta_dict(
import seaborn as sns
import itertools
import math
import time
import sys
import personal_popgen
from itertools import izip

# Seaborn setup: font scale 1.5 and the "whitegrid" style with the axes grid
# switched off.
sns.set(font_scale=1.5)
sns.set_style("whitegrid", {'axes.grid': False})

# Make the temp directory the current working directory.
pwd = '/proj/uppstore2017185/b2014034_nobackup/Venkat/Monarch_stuff/temp/'
os.chdir(pwd)

# Reference scaffolds; used below through .keys() and len(fasta[name]).
fasta = personal_popgen.fasta_dict(
    '/proj/uppstore2017185/b2014034_nobackup/Venkat/Monarch_stuff/genome/Danaus_plexippus_v3_-_scaffolds.fa'
)

# Empty table with one row per window: CHROM, BIN_START, BIN_END.
chromosome = pandas.DataFrame(columns=['CHROM', 'BIN_START', 'BIN_END'])

# Derive 10000-position window boundaries for every scaffold.
for i in fasta.keys():
    temp = {}
    if len(fasta[i]) >= 10000:
        # Starts are 1, 10001, ...; the final start is removed with pop().
        # NOTE: .pop() relies on range() returning a list (Python 2).
        # NOTE(review): when len(fasta[i]) % 10000 == 1, list_start ends up
        # one element shorter than list_end -- confirm how later code copes.
        list_start = range(1, len(fasta[i]), 10000)
        list_start.pop()
        list_end = range(10000, len(fasta[i]), 10000)
    else:
        # Scaffolds shorter than 10000 get the single window 1..10000.
        list_start = [1]
        list_end = [10000]
    # One CHROM entry per window end.
    chrom = [i] * len(list_end)
    temp['CHROM'] = pandas.Series(chrom)
# Count sites per window, either over all sites or (second branch) after
# building the set of positions to exclude for intergenic sites.
if type_of_sites == 'all':
    # Keep only the (CHROM, POS) pairs present in every table of list_1.
    df_pass=functools.reduce(lambda x, y: pandas.merge(x, y, on=['CHROM','POS'], how='inner'), list_1)
    print ("merged all")
    del list_1
    # Assign each 1-based POS to its window [BIN_START, BIN_END]. Flooring
    # (POS - 1) keeps a position that is an exact multiple of args.length in
    # the window it closes (e.g. POS == args.length -> window starting at 1);
    # flooring POS itself put it in the next window, where the
    # BIN_START <= POS filter below silently dropped it.
    df_pass['BIN_START']=(np.floor((df_pass['POS']-1)/args.length)*args.length)+1
    df_pass['BIN_END']=df_pass['BIN_START']+(args.length-1)
    # Restrict to windows that exist in the `chromosome` window table.
    df_merge = pandas.merge(df_pass, chromosome, on=['CHROM','BIN_START','BIN_END'],how='inner')
    # Safety check: every retained position must lie inside its window.
    df_merge = df_merge.query('BIN_START <= POS and BIN_END >= POS')
    print ("computed windows")
    del df_pass
    # Number of sites per window, stored in a column named <output>_N_sites.
    out=pandas.DataFrame({output+'_N_sites' :df_merge.groupby(['CHROM','BIN_START','BIN_END']).size()}).reset_index()
    print ("computed output")
    out.to_csv('/scratch/vt20265/bam_files/'+output+'_'+str(part)+'_N_sites')
    print ("completed")
elif type_of_sites == 'intergenic':
    fasta_DPlex=personal_popgen.fasta_dict(refrence_file)
    # Read the space-separated (CHROM, POS) tables for the three codon
    # position files and the introns file, keeping only scaffolds in `temp`.
    codon_df_1=pandas.read_csv('codon_df_1.csv',names=['CHROM','POS'], sep=' ',header=None)
    codon_df_1=codon_df_1[codon_df_1['CHROM'].isin(temp)]
    codon_df_2=pandas.read_csv('codon_df_2.csv',names=['CHROM','POS'], sep=' ',header=None)
    codon_df_2=codon_df_2[codon_df_2['CHROM'].isin(temp)]
    codon_df_3=pandas.read_csv('codon_df_3.csv',names=['CHROM','POS'], sep=' ',header=None)
    codon_df_3=codon_df_3[codon_df_3['CHROM'].isin(temp)]
    introns=pandas.read_csv('introns.csv',names=['CHROM','POS'], sep=' ',header=None)
    introns=introns[introns['CHROM'].isin(temp)]
    # Everything listed in those four tables is excluded from the intergenic set.
    intergenic_exclude=pandas.concat([codon_df_1[['CHROM','POS']], codon_df_2[['CHROM','POS']], codon_df_3[['CHROM','POS']], introns[['CHROM','POS']]], ignore_index=True)
    intergenic_exclude=intergenic_exclude[['CHROM','POS']]
    intergenic=pandas.DataFrame(columns=['CHROM','POS'])
    for i in temp:
        # All 1-based positions on this scaffold, and the excluded ones.
        chrom_len=len(fasta_DPlex[i])
        all_site=list(range(1,chrom_len+1))
        exlude_sites=list(intergenic_exclude[(intergenic_exclude.CHROM == i )]['POS'])
# Example #4
# 0
    #           temp['minor_freq']=[allele[min(allele, key=lambda k: allele[k])]]
    #           temp['major_allele']=[max(allele, key=lambda k: allele[k])]
    #           temp['major_freq']=[allele[max(allele, key=lambda k: allele[k])]]
    #           temp_df = pandas.DataFrame(temp)
    #           new_output=new_output.append(temp_df, ignore_index=True)
    #new_output['BIN_START']=(np.floor(new_output['POS']/100000)*100000)+1
    #new_output['BIN_END']=new_output['BIN_START']+(100000-1)
    return new_output


# Parse command-line options; this section reads args.freq and args.chr.
args = get_args()

# Seaborn setup: font scale 1.5 and the "whitegrid" style with the axes grid
# switched off.
sns.set(font_scale=1.5)
sns.set_style("whitegrid", {'axes.grid': False})

CDS_fasta = personal_popgen.fasta_dict(
    '/work/smalab/Venkat/genome/Danaus_plexippus_v3_-_cds.fa')
# Per-site allele frequency table built from args.freq and args.chr.
goods = output_good_sites_fequency(args.freq, args.chr)
#goods=output_good_sites_fequency("/proj/uppstore2017185/b2014034_nobackup/Venkat/Monarch_stuff/temp/ALL_induveduals.frq",86)
# Drop sites where either allele is '*'.
goods = goods[(goods.major_allele != '*') & (goods.minor_allele != '*')]
goods['POS'] = goods['POS'].astype(int)
# Keep fixed sites (minor_freq == 0.0) followed by sites whose minor allele
# frequency is at least 2/args.chr (rounded to 8 decimals). pandas.concat
# replaces DataFrame.append, which newer pandas releases no longer provide.
fixed = goods[(goods['minor_freq'] == 0.0)]
fixed = pandas.concat(
    [fixed, goods[(goods['minor_freq'] >= round(float(2.0 / (args.chr)), 8))]],
    ignore_index=True)
goods = fixed
goods['CODE'] = ''
# Tag A/G polymorphic sites with 'R' (presumably the IUPAC ambiguity code).
# A single .loc assignment is used instead of chained indexing
# (goods['CODE'][mask] = ...), which can write to a temporary copy and leave
# `goods` unchanged.
goods.loc[(goods['minor_freq'] >= round(float(2.0 / (args.chr)), 8)) & (
    (goods['minor_allele'] + goods['major_allele'] == 'AG')
    | (goods['minor_allele'] + goods['major_allele'] == 'GA')), 'CODE'] = 'R'
goods['CODE'][(goods['minor_freq'] >= round(float(2.0 / (args.chr)), 8)) & (
    (goods['minor_allele'] + goods['major_allele'] == 'CT')
# Example #5
# 0
import statsmodels.api as sm
import seaborn as sns
import itertools
import math
import time
import sys
import personal_popgen
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna, generic_protein
from Bio.SeqUtils import GC
import scipy

# Get genes that are in one ORF, from start codon to stop codon.

# CDS sequences loaded from ALL_induveduals.CDS.fasta.
fasta_DPlex = personal_popgen.fasta_dict(
    '/proj/uppstore2017185/b2014034_nobackup/Venkat/Monarch_stuff/temp/ALL_induveduals.CDS.fasta'
)
# CDS sequences loaded from the outgroup file danaus_eresimus.CDS.fasta.
fasta_DErip = personal_popgen.fasta_dict(
    '/proj/uppstore2017185/b2014034_nobackup/Venkat/Monarch_stuff/Outgroups/danaus_eresimus.CDS.fasta'
)

# Accumulators filled by the loop that follows.
good_genes = []
realign = []
### Get gene-wise alignments
for i in fasta_DPlex.keys():
    if i in (fasta_DErip.keys()) and (len(fasta_DErip[i]) > 0.0):
        Ns = float(str(fasta_DErip[i]).count('N')) / len(fasta_DErip[i])
        if Ns < 0.35:
            if len(fasta_DPlex[i]) != len(fasta_DErip[i]):
                realign.append(i[0:-3])
            elif len(fasta_DPlex[i]) == len(fasta_DErip[i]):
from Bio.Alphabet import IUPAC
from Bio.Seq import MutableSeq
import argparse
from Bio import codonalign
from Bio.codonalign.codonseq import cal_dn_ds
from Bio.codonalign.codonseq import default_codon_table
from Bio.codonalign import CodonSeq
from Bio.Alphabet.IUPAC import unambiguous_dna, ambiguous_dna


# Work from the dN/dS temp directory; the relative FASTA paths below are
# resolved against it.
pwd='/proj/uppstore2017185/b2014034_nobackup/POPULATION_RESEQ/dnds/temp_files/'
os.chdir(pwd)



# CDS FASTA files, one per input file (sinapis, spanish_reali, juvernica).
fasta_1=personal_popgen.fasta_dict('sinapis.CDS.fasta')
fasta_2=personal_popgen.fasta_dict('spanish_reali.CDS.fasta')
fasta_3=personal_popgen.fasta_dict('juvernica.CDS.fasta')




#DM_genes=pandas.read_csv("/proj/uppstore2017185/b2014034_nobackup/POPULATION_RESEQ/final_VCF/pop_stats_shared_private_2/gene_ortholog_flybase/overlap_new.csv", names=['DM_gene','LS_gene'], sep=' ')


for i in fasta_1.keys():
		F = open("/proj/uppstore2017185/b2014034_nobackup/POPULATION_RESEQ/dnds/gene_allignments_species/"+str(i)+'.fasta',"w") 
		#F.write(">"+str(gene_name)+'_L_Sin'+'\n')
		F.write(">"+'L_Sin'+'\n')
		F.write(str(fasta_1[i])+'\n')
		#F.write(">"+str(gene_name)+'_L_Rea'+'\n')
# Example #7
# 0
import math
import time
import sys
import personal_popgen
from pandas.tools.plotting import scatter_matrix
import Bio.Data.CodonTable
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC




# Column names applied to the nine fields of the annotation table.
head_stuff=['scaffold','source','feature','start','end','extra','-','indexing','infor']
#CDSs=pandas.read_table("/home/venkat/bin/snpgenie/test/",skiprows=1, names=head_stuff)

CDS_fasta=personal_popgen.fasta_dict('/proj/uppstore2017185/b2014034/NBIS_annotation_leptidea/fasta/cds.fa')

# Read the GFF (first line skipped) and keep every column except 'extra'.
annotation=pandas.read_table("/proj/uppstore2017185/b2014034/NBIS_annotation_leptidea/gff/gene-builds/leptidea_sinapis_rc1.gff",skiprows=1, names=head_stuff)
annotation=annotation[['scaffold','source','feature','start','end','-','indexing','infor']]

# Decrement 'start' by one on every row whose 'indexing' value is '0', '1'
# or '2' and whose '-' (strand) value is '+' or '-'. Rows with any other
# value in either column are left untouched.
annotation.loc[
    annotation['indexing'].isin(['0', '1', '2'])
    & annotation['-'].isin(['+', '-']),
    'start'
] -= 1
# Example #8
# 0
import personal_popgen
from pandas.tools.plotting import scatter_matrix
import Bio.Data.CodonTable
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC




# Seaborn setup: font scale 1.5 and the "whitegrid" style with the axes grid
# switched off.
sns.set(font_scale=1.5)
sns.set_style("whitegrid", {'axes.grid' : False})

# Make the pop-stats directory the current working directory.
pwd='/proj/uppstore2017185/b2014034_nobackup/POPULATION_RESEQ/final_VCF/pop_stats_shared_private_2/'
os.chdir(pwd)

# Genome assembly scaffolds (v1.4).
fasta=personal_popgen.fasta_dict('/proj/uppstore2017185/b2014034_nobackup/GENOME_ASSEMBLY/assembly_updates/v1.4/N.Backstrom_leptidea.scf.1.4.fasta')

# Annotated CDS sequences.
CDS_fasta=personal_popgen.fasta_dict('/proj/uppstore2017185/b2014034/NBIS_annotation_leptidea/fasta/cds.fa')

# Second command-line argument, kept as a string.
part=str(sys.argv[2])

#part='part_1'


def get_intron(CDS):
	df1=CDS
	df1.index = range(1, 2*len(df1)+1, 2)
	df2 = df1.reindex(index=range(2*len(df1)))
	df2 = df2.iloc[1:]
	df2.scaffold = df2.scaffold.ffill()
	df2.source = df2.source.ffill()