# NOTE(review): the line below is a whitespace-collapsed script fragment -- the
# original line breaks are missing, so it is not valid Python as written.
# Read in order, the statements:
#   * import the analysis stack (pandas, numpy, matplotlib, seaborn, ...), the
#     project module personal_popgen, and itertools.izip (Python 2 only);
#   * load the Danaus plexippus v3 scaffold FASTA and the LDhelmet per-sample
#     FASTA files stm163_1, stm146_1, T9_1 and T14_1 through
#     personal_popgen.fasta_dict.
# The final fasta_NJ203_1 call is cut off before its argument.
# fasta_dict presumably returns a mapping keyed by sequence name -- confirm in
# personal_popgen.
import string import pandas import numpy as np import matplotlib.pyplot as plt import seaborn as sns import itertools import math import time import sys import personal_popgen import re import random from itertools import izip fasta_DPlex = personal_popgen.fasta_dict( '/proj/uppstore2017185/b2014034_nobackup/Venkat/Monarch_stuff/genome/Danaus_plexippus_v3_-_scaffolds.fa' ) fasta_stm163_1 = personal_popgen.fasta_dict( '/proj/uppstore2017185/b2014034_nobackup/Venkat/Monarch_stuff/Scripts/LDhelmet/stm163_1.fa' ) fasta_stm146_1 = personal_popgen.fasta_dict( '/proj/uppstore2017185/b2014034_nobackup/Venkat/Monarch_stuff/Scripts/LDhelmet/stm146_1.fa' ) fasta_T9_1 = personal_popgen.fasta_dict( '/proj/uppstore2017185/b2014034_nobackup/Venkat/Monarch_stuff/Scripts/LDhelmet/T9_1.fa' ) fasta_T14_1 = personal_popgen.fasta_dict( '/proj/uppstore2017185/b2014034_nobackup/Venkat/Monarch_stuff/Scripts/LDhelmet/T14_1.fa' ) fasta_NJ203_1 = personal_popgen.fasta_dict(
# NOTE(review): whitespace-collapsed script fragment; the original line breaks
# and indentation are missing, so the line below is not valid Python as written.
# Read in order, the statements:
#   * configure seaborn (font scale 1.5, "whitegrid" style with the axes grid
#     switched off) and chdir into the Monarch temp directory -- `os` is used
#     here but its import is not part of this fragment;
#   * load the Danaus plexippus v3 scaffold FASTA via personal_popgen.fasta_dict;
#   * create an empty CHROM / BIN_START / BIN_END frame and, for every key of
#     `fasta`, build window boundaries in steps of 10000: starts 1, 10001, ...
#     with the last start removed by .pop(), ends 10000, 20000, ...; a sequence
#     shorter than 10000 gets the single window [1, 10000].
# `list_start.pop()` needs range() to return a list, i.e. Python 2 semantics
# (consistent with the izip import on the same line).
# The loop body appears to continue beyond the end of this fragment.
import seaborn as sns import itertools import math import time import sys import personal_popgen from itertools import izip sns.set(font_scale=1.5) sns.set_style("whitegrid", {'axes.grid': False}) pwd = '/proj/uppstore2017185/b2014034_nobackup/Venkat/Monarch_stuff/temp/' os.chdir(pwd) fasta = personal_popgen.fasta_dict( '/proj/uppstore2017185/b2014034_nobackup/Venkat/Monarch_stuff/genome/Danaus_plexippus_v3_-_scaffolds.fa' ) chromosome = pandas.DataFrame(columns=['CHROM', 'BIN_START', 'BIN_END']) for i in fasta.keys(): temp = {} if len(fasta[i]) >= 10000: list_start = range(1, len(fasta[i]), 10000) list_start.pop() list_end = range(10000, len(fasta[i]), 10000) else: list_start = [1] list_end = [10000] chrom = [i] * len(list_end) temp['CHROM'] = pandas.Series(chrom)
# Per-window site counts ('all') or set-up of the intergenic site list
# ('intergenic').
# NOTE(review): type_of_sites, list_1, args, chromosome, output, part, temp and
# refrence_file come from outside this fragment, as do the functools / np /
# pandas / personal_popgen imports.
if type_of_sites == 'all':
    # Inner-join every table in list_1 on (CHROM, POS): only positions present
    # in all of them survive.
    df_pass = functools.reduce(
        lambda x, y: pandas.merge(x, y, on=['CHROM', 'POS'], how='inner'),
        list_1)
    print("merged all")
    del list_1  # no longer needed once merged

    # Windows are 1-based and closed: [1, length], [length + 1, 2 * length], ...
    # POS is shifted by one before flooring so that a position which is an
    # exact multiple of args.length lands in the window it closes.  Without the
    # shift such a position was assigned to the following window and then
    # discarded by the range filter below, so it was never counted.
    df_pass['BIN_START'] = (
        np.floor((df_pass['POS'] - 1) / args.length) * args.length) + 1
    df_pass['BIN_END'] = df_pass['BIN_START'] + (args.length - 1)

    # Keep only windows that are listed in `chromosome`.
    df_merge = pandas.merge(
        df_pass, chromosome,
        on=['CHROM', 'BIN_START', 'BIN_END'], how='inner')
    # Sanity filter: every position must lie inside its assigned window.
    df_merge = df_merge.query('BIN_START <= POS and BIN_END >= POS')
    print("computed windows")
    del df_pass

    # One row per window with the number of surviving sites.
    out = pandas.DataFrame({
        output + '_N_sites':
            df_merge.groupby(['CHROM', 'BIN_START', 'BIN_END']).size()
    }).reset_index()
    print("computed output")
    out.to_csv('/scratch/vt20265/bam_files/' + output + '_' + str(part) + '_N_sites')
    print("completed")
elif type_of_sites == 'intergenic':
    fasta_DPlex = personal_popgen.fasta_dict(refrence_file)

    # Space-separated, header-less CHROM/POS tables, each restricted to the
    # scaffolds named in `temp`.
    codon_df_1 = pandas.read_csv('codon_df_1.csv', names=['CHROM', 'POS'], sep=' ', header=None)
    codon_df_1 = codon_df_1[codon_df_1['CHROM'].isin(temp)]
    codon_df_2 = pandas.read_csv('codon_df_2.csv', names=['CHROM', 'POS'], sep=' ', header=None)
    codon_df_2 = codon_df_2[codon_df_2['CHROM'].isin(temp)]
    codon_df_3 = pandas.read_csv('codon_df_3.csv', names=['CHROM', 'POS'], sep=' ', header=None)
    codon_df_3 = codon_df_3[codon_df_3['CHROM'].isin(temp)]
    introns = pandas.read_csv('introns.csv', names=['CHROM', 'POS'], sep=' ', header=None)
    introns = introns[introns['CHROM'].isin(temp)]

    # Every position listed in the three codon tables or the intron table is
    # excluded from the intergenic set.
    intergenic_exclude = pandas.concat(
        [codon_df_1[['CHROM', 'POS']],
         codon_df_2[['CHROM', 'POS']],
         codon_df_3[['CHROM', 'POS']],
         introns[['CHROM', 'POS']]],
        ignore_index=True)
    intergenic_exclude = intergenic_exclude[['CHROM', 'POS']]

    intergenic = pandas.DataFrame(columns=['CHROM', 'POS'])
    for i in temp:
        chrom_len = len(fasta_DPlex[i])
        # 1-based positions 1..chrom_len for this scaffold.
        all_site = list(range(1, chrom_len + 1))
        # Name (including its spelling) kept unchanged: the loop appears to
        # continue beyond this fragment and may refer to it.
        exlude_sites = list(intergenic_exclude[(intergenic_exclude.CHROM == i)]['POS'])
# NOTE(review): whitespace-collapsed script fragment; the original line breaks
# are missing.  Because the line starts with '#', Python would treat the whole
# line as one comment as it stands.
# Read in order, the text holds:
#   * commented-out code and the closing `return new_output` of a function
#     whose beginning is not part of this fragment;
#   * script code that parses arguments (get_args), configures seaborn, loads
#     the Danaus plexippus v3 CDS FASTA and calls
#     output_good_sites_fequency(args.freq, args.chr);
#   * filtering of `goods`: rows whose major or minor allele is '*' are
#     dropped, POS is cast to int, and only sites with minor_freq == 0.0 or
#     minor_freq >= round(2.0 / args.chr, 8) are kept;
#   * a new CODE column, set to 'R' for kept non-fixed sites whose
#     minor + major alleles spell 'AG' or 'GA'; the following assignment
#     (starting with 'CT') is cut off.
# NOTE(review): goods['CODE'][mask] = ... is chained indexing, which pandas
# documents as not guaranteed to write back -- consider goods.loc[mask, 'CODE'].
# NOTE(review): DataFrame.append was removed in pandas 2.0; pandas.concat is the
# replacement.
# temp['minor_freq']=[allele[min(allele, key=lambda k: allele[k])]] # temp['major_allele']=[max(allele, key=lambda k: allele[k])] # temp['major_freq']=[allele[max(allele, key=lambda k: allele[k])]] # temp_df = pandas.DataFrame(temp) # new_output=new_output.append(temp_df, ignore_index=True) #new_output['BIN_START']=(np.floor(new_output['POS']/100000)*100000)+1 #new_output['BIN_END']=new_output['BIN_START']+(100000-1) return new_output args = get_args() sns.set(font_scale=1.5) sns.set_style("whitegrid", {'axes.grid': False}) CDS_fasta = personal_popgen.fasta_dict( '/work/smalab/Venkat/genome/Danaus_plexippus_v3_-_cds.fa') goods = output_good_sites_fequency(args.freq, args.chr) #goods=output_good_sites_fequency("/proj/uppstore2017185/b2014034_nobackup/Venkat/Monarch_stuff/temp/ALL_induveduals.frq",86) goods = goods[(goods.major_allele != '*') & (goods.minor_allele != '*')] goods['POS'] = goods['POS'].astype(int) fixed = goods[(goods['minor_freq'] == 0.0)] fixed = fixed.append( goods[(goods['minor_freq'] >= round(float(2.0 / (args.chr)), 8))], ignore_index=True) goods = fixed goods['CODE'] = '' goods['CODE'][(goods['minor_freq'] >= round(float(2.0 / (args.chr)), 8)) & ( (goods['minor_allele'] + goods['major_allele'] == 'AG') | (goods['minor_allele'] + goods['major_allele'] == 'GA'))] = 'R' goods['CODE'][(goods['minor_freq'] >= round(float(2.0 / (args.chr)), 8)) & ( (goods['minor_allele'] + goods['major_allele'] == 'CT')
# NOTE(review): whitespace-collapsed script fragment; the original line breaks
# and indentation are missing, and the first '#' would turn the remainder of
# the line into a comment as it stands.
# Read in order, the statements:
#   * import statsmodels, seaborn, Biopython helpers, scipy and the project
#     module personal_popgen;
#   * load two CDS FASTA files through personal_popgen.fasta_dict: the
#     ALL_induveduals set (fasta_DPlex) and the danaus_eresimus outgroup
#     (fasta_DErip);
#   * for every key of fasta_DPlex that also exists in fasta_DErip with a
#     non-empty sequence, compute the fraction of 'N' characters in the
#     outgroup sequence; when that fraction is below 0.35 and the two sequences
#     differ in length, the key minus its last three characters is appended to
#     `realign`.
# The equal-length branch is cut off at the end of the fragment.
# NOTE(review): Bio.Alphabet was removed in Biopython 1.78, so these imports
# fail on current Biopython releases.
import statsmodels.api as sm import seaborn as sns import itertools import math import time import sys import personal_popgen from Bio.Seq import Seq from Bio.Alphabet import generic_dna, generic_protein from Bio.SeqUtils import GC import scipy #get genes that are in one orf, start to stop codon, fasta_DPlex = personal_popgen.fasta_dict( '/proj/uppstore2017185/b2014034_nobackup/Venkat/Monarch_stuff/temp/ALL_induveduals.CDS.fasta' ) fasta_DErip = personal_popgen.fasta_dict( '/proj/uppstore2017185/b2014034_nobackup/Venkat/Monarch_stuff/Outgroups/danaus_eresimus.CDS.fasta' ) good_genes = [] realign = [] ###get gene wize allignments for i in fasta_DPlex.keys(): if i in (fasta_DErip.keys()) and (len(fasta_DErip[i]) > 0.0): Ns = float(str(fasta_DErip[i]).count('N')) / len(fasta_DErip[i]) if Ns < 0.35: if len(fasta_DPlex[i]) != len(fasta_DErip[i]): realign.append(i[0:-3]) elif len(fasta_DPlex[i]) == len(fasta_DErip[i]):
# NOTE(review): whitespace-collapsed script fragment; the original line breaks
# and indentation are missing, and the first '#' would turn the remainder of
# the line into a comment as it stands.
# Read in order, the statements:
#   * import Biopython codon-alignment helpers and argparse (`os` and
#     personal_popgen are used but imported outside this fragment);
#   * chdir into the dnds temp_files directory and load the sinapis,
#     spanish_reali and juvernica CDS FASTA files via personal_popgen.fasta_dict;
#   * for every key of fasta_1, open "<key>.fasta" in the
#     gene_allignments_species directory for writing and write the header line
#     ">L_Sin" followed by that key's sinapis sequence.
# The loop body is cut off here; closing of F is not visible in this fragment.
# NOTE(review): F is opened without a `with` block -- confirm it is closed in
# the part of the loop that is not shown.
# NOTE(review): Bio.Alphabet was removed in Biopython 1.78, so these imports
# fail on current Biopython releases.
from Bio.Alphabet import IUPAC from Bio.Seq import MutableSeq import argparse from Bio import codonalign from Bio.codonalign.codonseq import cal_dn_ds from Bio.codonalign.codonseq import default_codon_table from Bio.codonalign import CodonSeq from Bio.Alphabet.IUPAC import unambiguous_dna, ambiguous_dna pwd='/proj/uppstore2017185/b2014034_nobackup/POPULATION_RESEQ/dnds/temp_files/' os.chdir(pwd) fasta_1=personal_popgen.fasta_dict('sinapis.CDS.fasta') fasta_2=personal_popgen.fasta_dict('spanish_reali.CDS.fasta') fasta_3=personal_popgen.fasta_dict('juvernica.CDS.fasta') #DM_genes=pandas.read_csv("/proj/uppstore2017185/b2014034_nobackup/POPULATION_RESEQ/final_VCF/pop_stats_shared_private_2/gene_ortholog_flybase/overlap_new.csv", names=['DM_gene','LS_gene'], sep=' ') for i in fasta_1.keys(): F = open("/proj/uppstore2017185/b2014034_nobackup/POPULATION_RESEQ/dnds/gene_allignments_species/"+str(i)+'.fasta',"w") #F.write(">"+str(gene_name)+'_L_Sin'+'\n') F.write(">"+'L_Sin'+'\n') F.write(str(fasta_1[i])+'\n') #F.write(">"+str(gene_name)+'_L_Rea'+'\n')
import math
import time
import sys
import personal_popgen
from pandas.tools.plotting import scatter_matrix
import Bio.Data.CodonTable
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

# Column labels applied to the annotation table; '-' holds the strand and
# 'indexing' the value compared against '0' / '1' / '2' below.
head_stuff = [
    'scaffold', 'source', 'feature', 'start', 'end',
    'extra', '-', 'indexing', 'infor',
]
#CDSs=pandas.read_table("/home/venkat/bin/snpgenie/test/",skiprows=1, names=head_stuff)
CDS_fasta = personal_popgen.fasta_dict(
    '/proj/uppstore2017185/b2014034/NBIS_annotation_leptidea/fasta/cds.fa')

# Load the gene-build GFF (first line skipped) and drop the 'extra' column.
annotation = pandas.read_table(
    "/proj/uppstore2017185/b2014034/NBIS_annotation_leptidea/gff/gene-builds/leptidea_sinapis_rc1.gff",
    skiprows=1,
    names=head_stuff,
)
annotation = annotation[
    ['scaffold', 'source', 'feature', 'start', 'end', '-', 'indexing', 'infor']
]

# Shift 'start' down by one for every row whose 'indexing' is '0', '1' or '2'
# on either strand.  The (indexing, strand) pairs are visited in the same
# order as the original six statements; each pair selects a disjoint set of
# rows, so every matching row is decremented exactly once.
for phase_value, strand_value in (
        ('1', '+'), ('2', '+'),
        ('1', '-'), ('2', '-'),
        ('0', '-'), ('0', '+')):
    rows = (annotation['indexing'] == phase_value) & (annotation['-'] == strand_value)
    annotation.loc[rows, 'start'] = annotation.loc[rows, 'start'] - 1
# NOTE(review): whitespace-collapsed script fragment; the original line breaks
# and indentation are missing, and the first '#' would turn the remainder of
# the line into a comment as it stands.
# Read in order, the statements:
#   * import personal_popgen, pandas' scatter_matrix and Biopython helpers
#     (`sns`, `os` and `sys` are used but imported outside this fragment);
#   * configure seaborn, chdir into the pop_stats_shared_private_2 directory,
#     and load the v1.4 genome scaffolds and the CDS FASTA via
#     personal_popgen.fasta_dict;
#   * read `part` from the second command-line argument (sys.argv[2]);
#   * start get_intron(CDS): the frame's index is set to 1, 3, 5, ..., it is
#     reindexed onto 0 .. 2*len-1 (which inserts an all-NaN row at every even
#     index), row 0 is dropped, and 'scaffold' and 'source' are forward-filled
#     into the inserted rows.  Presumably the inserted rows become the introns
#     between consecutive CDS rows -- the rest of the function is not part of
#     this fragment.
# NOTE(review): `df1=CDS` is an alias, not a copy, so assigning df1.index also
# changes the index of the caller's DataFrame.
# NOTE(review): pandas.tools.plotting was moved to pandas.plotting in newer
# pandas releases.
import personal_popgen from pandas.tools.plotting import scatter_matrix import Bio.Data.CodonTable from Bio.Seq import Seq from Bio.Alphabet import IUPAC sns.set(font_scale=1.5) sns.set_style("whitegrid", {'axes.grid' : False}) pwd='/proj/uppstore2017185/b2014034_nobackup/POPULATION_RESEQ/final_VCF/pop_stats_shared_private_2/' os.chdir(pwd) fasta=personal_popgen.fasta_dict('/proj/uppstore2017185/b2014034_nobackup/GENOME_ASSEMBLY/assembly_updates/v1.4/N.Backstrom_leptidea.scf.1.4.fasta') CDS_fasta=personal_popgen.fasta_dict('/proj/uppstore2017185/b2014034/NBIS_annotation_leptidea/fasta/cds.fa') part=str(sys.argv[2]) #part='part_1' def get_intron(CDS): df1=CDS df1.index = range(1, 2*len(df1)+1, 2) df2 = df1.reindex(index=range(2*len(df1))) df2 = df2.iloc[1:] df2.scaffold = df2.scaffold.ffill() df2.source = df2.source.ffill()