def test_write_gff3():
    """Round-trip check for ``write_gff3``.

    Writes the fixture annotation to ``temp.gff``, compares the raw file text
    with the expected ``written_gff`` string, then re-reads the written file
    and checks that its data frame equals the one parsed from the fixture.
    """
    gff3_df = generate_gff3_df()
    gff3_df.write_gff3('temp.gff')
    # Read through a context manager so the handle is closed before the same
    # file is parsed again below.
    with open('temp.gff') as written_file:
        gff_content = written_file.read()
    assert gff_content == written_gff
    read_gff_output = gff3pd.read_gff3('temp.gff')
    read_in_file = gff3pd.read_gff3('fixtures/test_file.gff')
    pd.testing.assert_frame_equal(read_in_file._df, read_gff_output._df)
def __init__(self, gff_file, resultant_file, fasta_txt_file,
             qvalue_threshold, dataset_id, genome_name):
    """Load the input tables and prepare the Excel log writer.

    Args:
        gff_file: Path of the GFF3 annotation file. The Excel log workbook
            is created in the same directory.
        resultant_file: Path of a tab-separated table, read with
            round-trip float precision.
        fasta_txt_file: Path of a tab-separated table that contains at
            least the columns ``ProteinName`` and ``Sequence``.
        qvalue_threshold: Stored as ``self.QValue_threshold``.
        dataset_id: Stored as ``self.dataset_id``.
        genome_name: Stored as ``self.genome_name``.
    """
    self.dataset_id = dataset_id
    self.genome_name = genome_name
    self.QValue_threshold = qvalue_threshold

    # Read the annotation file.
    self.gff_file = gff_file
    self.annotation = gffpd.read_gff3(gff_file)

    self.resultant_df = pd.read_csv(resultant_file, sep='\t',
                                    float_precision='round_trip')

    col_of_interest = ['ProteinName', 'Sequence']
    fasta_df = pd.read_csv(fasta_txt_file, sep='\t')[col_of_interest]
    # rename() without inplace returns a fresh frame, so the column added
    # below is not written onto a selection of the parsed table.
    fasta_df = fasta_df.rename(columns={'ProteinName': 'Protein'})
    fasta_df['Index'] = fasta_df.index + 1  # 1-based row number
    self.fasta_df = fasta_df

    # Cached here so repeated calls can reuse them.
    self.query_8_result = None
    self.peptide_report = None

    # The Excel log lives next to the annotation file. os.path.dirname keeps
    # relative paths relative and copes with a bare file name, which the
    # previous split('/') + leading "/" construction did not.
    log_dir = os.path.dirname(self.gff_file)
    log_path = os.path.join(log_dir, f"{DATA_LOG}.xlsx")
    self.writer = pd.ExcelWriter(log_path, engine='xlsxwriter')

    # Log data frame: one row per query plus the three report rows.
    indexes = [f'query_{idx}' for idx in range(23)]
    indexes.extend(['Peptide_Report', 'Protein_Report', 'qc_metrics'])
    cols = ['#row', '#col', 'col_name']
    self.log_df = pd.DataFrame(columns=cols, index=indexes)
def main():
    """Write strict and permissive FASTA files for verified yeast genes.

    Parses the command line, loads the GFF3 annotation and the tab-separated
    ssearch36 table, keeps annotation rows whose ``orf_classification`` is
    ``"Verified"`` and whose ``type`` is ``"gene"``, and calls
    ``get_gene_from_ssearch36_script`` for each of them. Sequences that are
    not ``None`` are written to ``<prefix>.strict.fasta`` and
    ``<prefix>.permissive.fasta`` in the output directory, where ``<prefix>``
    is the genome file's base name up to the first ``".ss"``.
    """
    parser = argparse.ArgumentParser(
        description="Generate outputs from ssearch36 output files")
    # These three values are used unconditionally below, so a missing one is
    # reported by argparse instead of surfacing later as a TypeError on None.
    parser.add_argument("-s", "--ssearch36-in", dest="ssearch_in",
                        required=True)
    parser.add_argument("-g", "--genome-in", dest="genome_in", required=True)
    parser.add_argument("-o", "--output-dir", dest="output_dir",
                        required=True)
    parser.add_argument(
        "--gff-yeast",
        dest="gff_yeast",
        default="/media/theboocock/data/Dropbox/PHDTHESIS/projects/gal_final_github/data/annotations/saccharomyces_cerevisiae.gff",
    )
    args = parser.parse_args()
    ssearch36_in = args.ssearch_in
    annot = args.gff_yeast
    genome_in = args.genome_in
    output_dir = args.output_dir

    out_prefix = os.path.basename(genome_in)
    out_new_all = out_prefix.split(".ss")[0]

    # NOTE(review): the return value was never used; the call is kept only in
    # case get_ssearch36 has side effects - confirm and remove if it has none.
    get_ssearch36(ssearch36_in)

    gff_yeast_annot = gffpd.read_gff3(annot).attributes_to_columns()
    # Keep only ORFs that are both Verified and genes. The explicit copy
    # makes the column assignment below act on an independent frame.
    is_verified_gene = (
        (gff_yeast_annot["orf_classification"] == "Verified")
        & (gff_yeast_annot["type"] == "gene")
    )
    gff_yeast_annot = gff_yeast_annot[is_verified_gene].copy()
    gff_yeast_annot["lengths"] = (
        gff_yeast_annot["end"] - gff_yeast_annot["start"] + 1
    )

    ssearch_in = pandas.read_csv(ssearch36_in, sep="\t", header=None)
    ssearch_in.columns = [
        "gene", "strain", "percentage_match", "3", "4", "5", "ref_start",
        "ref_end", "start", "end", "E", "11"
    ]

    strict_path = os.path.join(output_dir, out_new_all + ".strict.fasta")
    permissive_path = os.path.join(output_dir,
                                   out_new_all + ".permissive.fasta")
    with open(strict_path, "w") as out_strict, \
            open(permissive_path, "w") as out_f:
        for gene in gff_yeast_annot["Name"]:
            gene = str(gene)
            # One lookup serves as both the membership check and the length
            # fetch; rows whose Name is not this string (e.g. a missing
            # name) yield no match and are skipped, as before.
            gene_lengths = gff_yeast_annot.loc[
                gff_yeast_annot["Name"] == gene, "lengths"]
            if gene_lengths.empty:
                continue
            # .iloc[0] avoids int(Series), which is deprecated and fails
            # when a name occurs more than once.
            gene_length = int(gene_lengths.iloc[0])
            gene_out = get_gene_from_ssearch36_script(
                genome_in, ssearch_in, gene, gene_length)
            print(gene_out)
            # NOTE(review): assumes element [0] is the strict result and
            # [1] the permissive one, each holding the sequence (or None)
            # at position 1 - confirm against the helper.
            gene_strict = gene_out[0][1]
            gene_perm = gene_out[1][1]
            if gene_strict is not None:
                out_strict.write(">" + gene + "\n" + gene_strict + "\n")
            if gene_perm is not None:
                out_f.write(">" + gene + "\n" + gene_perm + "\n")
def find_overlap(pairs, file_output):
    """Count hmer records per file pair, flagged by whether they lie in an ORF.

    For every pair the hmer table (``pair[0]``, tab-separated, no header) and
    the GFF3 file (``pair[1]``) are read. Pairs whose hmer table does not
    contain exactly one distinct ``Sequence`` value are skipped. An hmer is
    flagged as inside an ORF when some GFF row satisfies
    ``start < hmer Start`` and ``end > hmer End``. The per-pair value counts
    are concatenated and written to ``file_output`` as a tab-separated table.

    Args:
        pairs: Sequence of indexable items; element 0 is the hmer table path
            and element 1 the GFF3 path.
        file_output: Path of the table to write. If no pair qualifies, a
            header-only table is written.
    """
    hmer_columns = ["Sequence", "Acession_ID", "Start", "End", "Length",
                    "Base", "Info"]
    count_columns = ['Sequence', 'Acession_ID', 'Length', 'Base',
                     'InORF(T/F)', 'Info']
    Hmer_total = []

    for pair in pairs:
        hmer_path, gff_path = pair[0], pair[1]
        print(hmer_path)
        print(gff_path)
        # Data frames from the hmer table and the GFF file.
        dfH = pd.read_table(hmer_path, names=hmer_columns, sep='\t')
        dfORF = gffpd.read_gff3(gff_path).df[["seq_id", "start", "end"]]

        if dfH['Sequence'].nunique() != 1:
            print("Skip")
            continue

        # Broadcast every hmer (rows) against every ORF (columns).
        hmer_start = dfH['Start'].to_numpy()[:, np.newaxis]
        hmer_end = dfH['End'].to_numpy()[:, np.newaxis]
        orf_start = dfORF['start'].to_numpy()
        orf_end = dfORF['end'].to_numpy()
        inside = (orf_start < hmer_start) & (orf_end > hmer_end)
        # One flag per hmer: True when it lies inside at least one ORF.
        dfH['InORF(T/F)'] = inside.any(axis=1)

        dfCounts = (dfH[count_columns]
                    .value_counts()
                    .sort_index()
                    .reset_index(name="Counts"))
        Hmer_total.append(dfCounts)

    if Hmer_total:
        Counts = pd.concat(Hmer_total)
    else:
        # pd.concat raises on an empty list; still emit a well-formed table.
        Counts = pd.DataFrame(columns=count_columns + ["Counts"])
    Counts.to_csv(file_output, sep="\t", index=False)
def f_import_transform_gtf(global_para):
    """Return the annotation's CDS rows with expanded, renamed attributes.

    Reads ``global_para.gtf_file`` with ``gffpd.read_gff3``, casts ``seq_id``
    to ``str`` and keeps rows whose ``type`` matches
    ``global_para.feature_type_cds``. Unless ``global_para.source`` is
    ``"all"``, rows are further restricted to the comma-separated sources it
    lists. The rows are passed through ``f_gtf_attributes_expand`` and the
    configured feature-id columns are renamed to ``gene_id``, ``gene_name``
    and ``transcript_id``.
    """
    annotation = gffpd.read_gff3(global_para.gtf_file)
    all_features = annotation.df.astype({"seq_id": "str"})
    cds_features = all_features.query('type == @global_para.feature_type_cds')

    if global_para.source != "all":
        # The query string refers to this local by name.
        tmp_list_source = global_para.source.split(',')
        cds_features = cds_features.query('source in @tmp_list_source')

    expanded = f_gtf_attributes_expand(cds_features)
    standard_names = {
        global_para.featureid_gene_id: 'gene_id',
        global_para.featureid_gene_name: 'gene_name',
        global_para.featureid_transcript_id: 'transcript_id',
    }
    expanded.rename(columns=standard_names, inplace=True)
    return expanded
def _note_from_attributes(attributes):
    """Return the ``Note`` value of a ``key=value;...`` string, or ``None``."""
    if not isinstance(attributes, str):
        return None
    # partition() splits on the first "=" only, so values containing "=" are
    # kept whole; fields without "=" (e.g. after a trailing ";") are ignored
    # instead of breaking the dict construction.
    fields = (field.partition('=') for field in attributes.split(';'))
    return {key: value for key, sep, value in fields if sep}.get('Note')


def main(map_file, gtf_file):
    """Call ``append_line`` for every mapping row that carries a ``Note``.

    The mapping file is read with ``gffpd.read_gff3``. The ``Note`` entry of
    each row's ``phase`` column becomes its transcript id, and the value in
    the ``index`` column produced by ``reset_index()`` is passed along as the
    UniProt identifier.

    NOTE(review): this presumably relies on the mapping file having one more
    leading column than standard GFF3, so that the identifier lands in the
    index and the attribute text in ``phase`` - confirm against the input.

    Args:
        map_file: Path of the mapping file.
        gtf_file: Passed unchanged as first argument to ``append_line``.
    """
    df_gtf = gffpd.read_gff3(map_file).df.reset_index()
    df_gtf['transcript_id'] = df_gtf['phase'].apply(_note_from_attributes)
    for uniprot, tid in zip(df_gtf['index'], df_gtf['transcript_id']):
        if tid is None:
            continue
        print(uniprot, tid)
        append_line(gtf_file, tid, uniprot)
sep='\t') expect_combined = [] table_df = pd.DataFrame(columns=[ "Sequence", "Length", "BP_inside", "Expected", "Hmer_inside(BP)", "Hmer_outside(BP)", "Observed" ]) with open("/Volumes/ubdata/mmcfad/NCBI_Genomes/Output_files/Code_content.txt", "w+") as w: for i in range(len(files_orf)): print(files_orf[i]) #Determining the expected amounts inside vs outside open reading frames orf = gffpd.read_gff3(files_orf[i]) df_orfLarge = pd.DataFrame(orf.df) selected_columns = df_orfLarge[["seq_id", "start", "end"]] ORF = selected_columns.copy() scaffolds = ORF['seq_id'].nunique() if scaffolds == 1: #Length of the genome from the header of the gff file header = orf.header split = header.split(" ") print(split) select_split = split[-1].replace(";", "=").split("=") print(select_split) code = select_split[4]
def test_to_gff3():
    """Round-trip check for ``to_gff3``.

    Writes the fixture annotation to ``temp.gff``, compares the raw file text
    with the expected ``written_gff`` string, then re-reads the written file
    and checks that its data frame equals the one parsed from the fixture.
    """
    # The test previously asserted on `gff_content` without ever assigning it
    # and read 'temp.gff' without writing it; produce both here so the test
    # is self-contained.
    gff3_df = generate_gff3_df()
    gff3_df.to_gff3('temp.gff')
    with open('temp.gff') as written_file:
        gff_content = written_file.read()
    assert gff_content == written_gff
    read_gff_output = gff3pd.read_gff3('temp.gff')
    read_in_file = gff3pd.read_gff3('fixtures/test_file.gff')
    pd.testing.assert_frame_equal(read_in_file.df, read_gff_output.df)
def generate_gff3_df():
    """Parse ``fixtures/test_file.gff`` and return the object from ``read_gff3``."""
    return gff3pd.read_gff3('fixtures/test_file.gff')
def MakeGFFindex(gff3file):
    """Read ``gff3file`` with ``gffpd.read_gff3`` and return its ``df`` attribute."""
    return gffpd.read_gff3(gff3file).df
protein_id = args.pid if protein_id != "NOFILE": pid_DF = pd.read_csv(protein_id, sep='\t') else: pid_DF = False organism_name_in = args.nam strain_in = args.stn link_evi = args.gty locus_tag_counter = 0 #initialization PreContig = "" #initialization Contig_Count = 0 #initialization OUT_CHA = "" #initialization gff_df = gffpd.read_gff3(in_file) gff_df_col = gff_df.attributes_to_columns() gff_df_col = gff_df_col.sort_values('start') with open(out_path, mode='w') as f: f.write(OUT_CHA) for record in SeqIO.parse(fasta_in, 'fasta'): record = record.upper() #All bases should be capitalized. print("new_contig") features = [] #Initialize the features (sometimes it's not there) length = len(record) #Get array length NowContig = record.id #Get the array name. print("Processing " + NowContig) OUT_CHA += FASTA_CHA_SET(length, NowContig, organism_name_in, strain_in, mol_type_in) ##Detect and describe the gap region from fasta. print("Gap finding")
''' # Load the libraries import pandas as pd import pathlib import os import gffpandas.gffpandas as gffpd import csv # Creating in- and output file names inPath = "../../data/Reference_genome_and_annotation_file/PGSC_DM_V403_genes.gff" outPath = "../../data/Reference_genome_and_annotation_file/PGSC_DM_V403_genes_case-corrected.gff" # Create absolute path and read the file absInPath = os.path.abspath(inPath) rawFile = gffpd.read_gff3(absInPath) # Defining the function for changing the attributes column of the gff3 attributes data frame column def attrChange(attrString): # Create list of official GFF3 atrtributes from https://www.ncbi.nlm.nih.gov/datasets/docs/about-ncbi-gff3/ # This is needed to check for invalid cases in the input file offList = ['ID', 'Parent', 'Dbxref', 'Name', 'Note', 'Is_circular'] attrList = attrString.split(';') # Check for idiotic sepearation of values within the ';'-separated list ''' # Commented out because 'gff3ToGenePred' can't handle multiple names per annotation for j in range(len(attrList)): if '=' not in attrList[j]: attrList[j-1] = attrList[j-1] + ';' + attrList[j] ''' dummyList = attrList
import gffpandas.gffpandas as gffpd

# Export the distinct (gene_name, transcript_id) pairs found in the annotation
# attributes to 'geneToTranscript.csv'.
file = 'utr_features/MANE.GRCh38.v0.93.select_ensembl_genomic.gff'
annotation = gffpd.read_gff3(file)
attr_to_columns = annotation.attributes_to_columns()

# Chain non-mutating calls instead of using inplace=True on a column
# selection, which triggers pandas' chained-assignment warning.
gene_to_transcript_df = (
    attr_to_columns[['gene_name', 'transcript_id']]
    .drop_duplicates()
    .dropna()
)
gene_to_transcript_df.to_csv('geneToTranscript.csv', index=False)
import gffpandas
import gffpandas.gffpandas as gffpd
import sys
import re

# Rewrite the gene_id attribute of every row of the GFF3 file given as
# sys.argv[1] to the row's source_gene_common_name (unless that is the literal
# 'None'), and save the result next to the input as '<name>.geneid.gff3'.

_COMMON_NAME_RE = re.compile(r'source_gene_common_name=([^;]*)')
# (?<!\w) keeps the pattern from matching inside a longer key that merely ends
# in "gene_id"; [^;]* also covers a gene_id that is the last attribute.
_GENE_ID_RE = re.compile(r'(?<!\w)gene_id=[^;]*')


def _with_common_name_gene_id(attributes):
    """Return ``attributes`` with gene_id replaced by the common gene name.

    The string is returned unchanged when it has no
    ``source_gene_common_name`` entry or when that entry is ``'None'``.
    """
    found = _COMMON_NAME_RE.search(attributes)
    if found is None or found.group(1) == 'None':
        return attributes
    replacement = 'gene_id=' + found.group(1)
    # A callable replacement inserts the name literally; a plain string would
    # be interpreted as a regex template (backslashes, group references).
    return _GENE_ID_RE.sub(lambda _match: replacement, attributes)


def _geneid_output_path(input_path):
    """Return ``input_path`` with a trailing ``.gff3`` turned into ``.geneid.gff3``.

    Paths without that suffix get ``.geneid.gff3`` appended, so the output
    never has the same name as the input.
    """
    suffix = '.gff3'
    if input_path.endswith(suffix):
        return input_path[:-len(suffix)] + '.geneid.gff3'
    return input_path + '.geneid.gff3'


annotation = gffpd.read_gff3(sys.argv[1])
newattr = [_with_common_name_gene_id(x) for x in annotation.df['attributes']]
annotation.df['attributes'] = newattr
annotation.to_gff3(_geneid_output_path(sys.argv[1]))
gff=pd.read_table('tene_ALL_filenames_out.txt', names=['gff']) gff2=gff['gff'].tolist() overlap=pd.read_table("/Volumes/ubdata/mmcfad/NCBI_Genomes/Output_files/July_6_ALL.txt", sep='\t') expect_combined=[] table_df=pd.DataFrame(columns=["Sequence","Length","BP_inside","Expected","Hmer_inside(BP)", "Hmer_outside(BP)", "Observed"]) with open("/Volumes/ubdata/mmcfad/NCBI_Genomes/Output_files/Observed_expected_tene_ALL.txt", "w+") as w: for i in range(len(gff2)): print(gff2[i]) #Determining the expected amounts inside vs outside open reading frames orf=gffpd.read_gff3((gff2)[i]) df_orfLarge = pd.DataFrame(orf.df) selected_columns= df_orfLarge[["seq_id","start","end"]] ORF = selected_columns.copy() scaffolds=ORF['seq_id'].nunique() if scaffolds == 1: #Length of the genome from the header of the gff file header = orf.header split = header.split(" ") select_split=split[5].replace(";","=").split("=") length=int(select_split[3]) seq=ORF["seq_id"].unique()
import os
import sys

import gffpandas.gffpandas as gffpd
from Bio import SeqIO
from Bio.Seq import Seq

# Translate every feature of a GFF3 file from a single-record FASTA sequence
# and write one amino-acid FASTA file per feature to
# <outdir>/<orfname>/<samplename>_<cov>.fa.
nucleotides, features, samplename, cov, outdir = sys.argv[1:]

## read nucleotides
seq = str(SeqIO.read(nucleotides, "fasta").seq)

## read features
gffdict = gffpd.read_gff3(features).df.to_dict(orient="index")

for feature in gffdict.values():
    # The ORF name is the value of the second ';'-separated attribute.
    orfname = str(feature.get("attributes").split(";")[1].split("=")[-1])
    start = int(feature.get("start"))
    end = int(feature.get("end"))

    # GFF coordinates are 1-based and inclusive; "-" characters are removed
    # from the slice before translating.
    # NOTE(review): the strand column is not consulted - confirm that all
    # features are expected on the forward strand.
    sliced_seq = seq[start - 1 : end].replace("-", "")
    AminoAcids = Seq(sliced_seq).translate(to_stop=True)

    # open(..., "w") does not create missing directories, so make sure the
    # per-ORF output directory exists first.
    os.makedirs(f"{outdir}/{orfname}", exist_ok=True)
    with open(f"{outdir}/{orfname}/{samplename}_{cov}.fa", "w") as out:
        out.write(f">{samplename}_ORF-{orfname}_cov_{cov}\n" f"{AminoAcids}\n")