def get_flanking_positions(obj_tt, genomic_range, flank_type):
    """
    Resolve which genomic coordinates should be examined for a genomic alteration.

    Per the original notes, the position is kept as-is for SNVs & other events
    that leave the reading frame intact, and is widened by one position on each
    side for frameshifting events (insertions / deletions), so that the
    positions immediately before & after the alteration can be retrieved.

    Args:
        -obj_tt = accepted for call compatibility; it is not referenced in the body
        -genomic_range = string that is the genomic range of the genomic alteration,
            parsed by Isoform.split_genome_pos()
        -flank_type = integer that selects which positions are returned
            -0 = stay on the genomic position of the genomic alteration
            -any other value (documented as 1) = retrieve the position
                before (start - 1) & after (end + 1) the genomic alteration
    Returns:
        a hash (dict), not a string:
            -flank_type == 0 -> the hash returned by Isoform.split_genome_pos(), unchanged
            -otherwise -> a new hash with only the keys 'chrom', 'start', 'end'
    """
    parsed_pos = Isoform.split_genome_pos(genomic_range)

    stay_on_position = (flank_type == 0)
    if stay_on_position:
        return parsed_pos

    #widen the range by one position on both sides of the alteration
    chrom = parsed_pos['chrom']
    pos_before = parsed_pos['start'] - 1
    pos_after = parsed_pos['end'] + 1
    return {'chrom': chrom, 'start': pos_before, 'end': pos_after}
def create_obj_tt(isoform_id, genome_pos):
    """
    Builds a TranslateTranscript instance for an isoform at a genomic position

    Args:
        -isoform_id = string that is the isoform, usually in the form of an Ensembl ID (e.g. ENST000..)
        -genome_pos = string that is the genomic position of interest, parsed by
            Isoform.split_genome_pos(). Only its 'chrom' & 'start' values are used here.
    Returns:
        the TranslateTranscript instance created from the canonical transcript of the isoform
    """
    ensembl_db = 2      #per the original note, 2 means the database is Ensembl
    parsed_pos = Isoform.split_genome_pos(genome_pos)

    #position of interest, with the chromosome prefixed by 'chr'
    chrom_label = 'chr' + str(parsed_pos['chrom'])
    pos_of_interest = {'chrom': chrom_label, 'pos_oi': parsed_pos['start']}

    #NOTE(review): the meaning of the remaining positional arguments ([], -10, False, 0, True)
    #is defined by IsoformSJ, which is not visible here - confirm against that class
    isoform_sj = IsoformSJ(ensembl_db, isoform_id, [], -10, pos_of_interest, False, 0, True)
    canonical = isoform_sj.create_canon_transcript(False, False)

    return TranslateTranscript(canonical, isoform_sj, DIR_GENOME, {})
#NOTE(review): the next two statements use `i` & `sec_break`, which are not defined in the visible
#code - they look like the tail of a block (loop / function) that starts above this chunk, so their
#original indentation could not be recovered here. Confirm against the full file.
#Announce the pause, then sleep for `sec_break` seconds (the message attributes the pause to the Ensembl REST API request limit)
print "Body: at row ", i, " - taking a ", sec_break, " second break (Apparently Ensembl REST API only allows 15 requests per second)"
time.sleep(sec_break)

#Banner that marks the start of the script output
print "------------ Algorithm: 180825_ProcessGenomeAlt_V2.py ------------"
""" Algorithm: Determine the consequence of a genomic alteration & records it in file. This also retrieves the nucleotides before & after the position of the genomic alteration PROTOCOL: open file that contains genomic alterations -> calculate the consequence of each mutation by performing a VEP request -> record the results of the request (position, reading frame, relative CDS position, ) #LATER: retrieve the AA sequence before & after the genomic alteration """

#wall-clock timestamp taken at script start (presumably used later to report elapsed time - not visible here)
start_time = time.time()

#create the Genome handle from the sqlite database URL & hand it to Isoform via set_cruzdb()
g = Genome('sqlite:////tmp/hg19_v2.db')
Isoform.set_cruzdb(g)

""" mode_calc_exp -> will be used to determine if the gene expression should be calculated or not, where 0 = do not calculate gene expression, 1 = calculate gene expression. """
mode_calc_exp = 0       #use this if I do not want to consider thresholding by gene expression percentile. Also, this may include all occurrences of mutations, including non-coding?
# mode_calc_exp = 1       #use this to consider thresholding by gene expression percentile

#retrieve user-inputted parameters - 5 positional command-line arguments are read, so an IndexError is raised if fewer are given
arg_date_output = sys.argv[1]       #argument 1 (presumably a date label for the output - confirm against its use below this chunk)
thres_express = int( sys.argv[2])       #threshold for the gene expression percentile to accept (must be parseable as an integer)
path_velip_file = sys.argv[3]       #argument 3 (presumably the path to the input file - confirm)
output_dir = sys.argv[4]       #argument 4 (presumably the directory where results are written - confirm)
is_seq_WGS = sys.argv[5]       ##MAY DELETE, I DO NOT NEED THIS BECAUSE OF "path_velip_file". NOTE: this stays a string, it is not converted to a boolean here
# from SVSv7 import Isoform, IsoformSJ, TranscribeTranscript, TranslateTranscript, EnsemblVEP from SVSv7 import Isoform, SimpleNeoepitopeAllV2, SimpleNeoepitopeIsoformV2 from mokhaPy import mokhaPy #Constants - directories DIR_PROJ = "/home/mokha/Documents/Krauthammer_Lab" DIR_CURR = DIR_PROJ + "/PythonClasses/SVSv7" DIR_DATA = DIR_CURR + "/TestData" DIR_RESULTS = DIR_CURR + "/TestResults" DIR_GENOME = DIR_PROJ + '/ArchiveData/hg19.fa' #directory for samtool-indexed genome print "------------ TDD: 171108_SimpleNeoepV2_NMD.py ------------" g = Genome( 'sqlite:////tmp/hg19_v2.db' ) Isoform.set_cruzdb( g ) ##IMPORTANT TEST - THE "X" AMINO ACID # #simulate point mutation - the genomic range for this "X" amino acid is "X:153146127-153146128" # isoform_id = "ENST00000452593" # # genomic_range = "X:153146127-153146128" #this is the range of the "X" amino acid # genomic_range = "X:153146127-153146127" # orig = None # alt = "T" ##MUTATIONS - minus gene # #simulate point mutation - RESULT: codon = aAt/at & amino acids = N/X # isoform_id = "ENST00000376887" # # genomic_range = "13:95815411-95815411" # genomic_range = "13:95953564-95953564" # orig = None