Example #1
def test_write_gff3():
    gff3_df = generate_gff3_df()
    gff3_df.write_gff3('temp.gff')
    gff_content = open('temp.gff').read()
    assert gff_content == written_gff
    read_gff_output = gff3pd.read_gff3('temp.gff')
    read_in_file = gff3pd.read_gff3('fixtures/test_file.gff')
    pd.testing.assert_frame_equal(read_in_file._df, read_gff_output._df)
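A minimal sketch of the same round trip against the public gffpandas API (assuming current releases, where the writer method is spelled to_gff3 rather than write_gff3; 'temp.gff' is a scratch path):

import gffpandas.gffpandas as gffpd
import pandas as pd

# read a GFF3 file, write it back out, and check the round trip
annotation = gffpd.read_gff3('fixtures/test_file.gff')
annotation.to_gff3('temp.gff')
reread = gffpd.read_gff3('temp.gff')
pd.testing.assert_frame_equal(annotation.df, reread.df)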
Example #2
    def __init__(self, gff_file, resultant_file, fasta_txt_file,
                 qvalue_threshold, dataset_id, genome_name):

        self.dataset_id = dataset_id
        self.genome_name = genome_name
        self.QValue_threshold = qvalue_threshold
        # read annotation file.
        self.gff_file = gff_file
        self.annotation = gffpd.read_gff3(gff_file)
        self.resultant_df = pd.read_csv(resultant_file,
                                        sep='\t',
                                        float_precision='round_trip')

        col_of_interest = ['ProteinName', 'Sequence']
        self.fasta_df = pd.read_csv(fasta_txt_file, sep='\t')[col_of_interest]
        cols_to_rename = {'ProteinName': 'Protein'}
        self.fasta_df.rename(columns=cols_to_rename, inplace=True)
        self.fasta_df['Index'] = self.fasta_df.index + 1

        # cached so repeated calls can reuse these results
        self.query_8_result = None
        self.peptide_report = None

        # build the Excel log file next to the GFF file:
        file_loc = os.path.dirname(self.gff_file)
        self.writer = pd.ExcelWriter(os.path.join(file_loc, f"{DATA_LOG}.xlsx"),
                                     engine='xlsxwriter')

        # log dataframe.
        indexes = [f'query_{idx}' for idx in range(23)]
        indexes.extend(['Peptide_Report', 'Protein_Report', 'qc_metrics'])
        cols = ['#row', '#col', 'col_name']
        self.log_df = pd.DataFrame(columns=cols, index=indexes)
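A minimal sketch of the logging setup above ('report.xlsx' is a hypothetical path; requires the xlsxwriter package):

import pandas as pd

log_df = pd.DataFrame(columns=['#row', '#col', 'col_name'],
                      index=[f'query_{idx}' for idx in range(23)])

# write the (still empty) log frame to one sheet of an Excel workbook
with pd.ExcelWriter('report.xlsx', engine='xlsxwriter') as writer:
    log_df.to_excel(writer, sheet_name='log')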
Example #3
def main():
    parser = argparse.ArgumentParser(
        description="Generate outputs from ssearch36 output files")
    parser.add_argument("-s", "--ssearch36-in", dest="ssearch_in")
    parser.add_argument("-g", "--genome-in", dest="genome_in")
    parser.add_argument("-o", "--output-dir", dest="output_dir")
    parser.add_argument(
        "--gff-yeast",
        dest="gff_yeast",
        default="/media/theboocock/data/Dropbox/PHDTHESIS/projects/gal_final_github/data/annotations/saccharomyces_cerevisiae.gff")
    args = parser.parse_args()
    ssearch36_in = args.ssearch_in
    annot = args.gff_yeast
    genome_in = args.genome_in
    output_dir = args.output_dir
    out_prefix = os.path.basename(genome_in)
    out_new_all = out_prefix.split(".ss")[0]

    ssearch_df = get_ssearch36(args.ssearch_in)
    gff_yeast = gffpd.read_gff3(annot)
    gff_yeast_annot = gff_yeast.attributes_to_columns()
    # Extract orfs that are only Verified and genes.
    gff_yeast_annot = gff_yeast_annot[
        (gff_yeast_annot["orf_classification"] == "Verified")
        & (gff_yeast_annot["type"] == "gene")]
    lengths = gff_yeast_annot["end"] - gff_yeast_annot["start"] + 1
    gff_yeast_annot["lengths"] = (lengths)

    ssearch_in = pandas.read_csv(ssearch36_in, sep="\t", header=None)
    ssearch_in.columns = [
        "gene", "strain", "percentage_match", "3", "4", "5", "ref_start",
        "ref_end", "start", "end", "E", "11"
    ]

    with open(os.path.join(output_dir, out_new_all + ".strict.fasta"),
              "w") as out_strict:
        with open(os.path.join(output_dir, out_new_all + ".permissive.fasta"),
                  "w") as out_f:
            for gene in gff_yeast_annot["Name"]:
                gene = str(gene)
                if gff_yeast_annot["Name"].isin([gene]).any():
                    gene_length = int(gff_yeast_annot.loc[
                        gff_yeast_annot["Name"] == gene, "lengths"].iloc[0])
                    #f_gene = partial(get_gene_from_ssearch36_script, gene, genome_in, gene_length)
                    gene_out = get_gene_from_ssearch36_script(
                        genome_in, ssearch_in, gene, gene_length)
                    #gene_out = [f_gene(df)]
                    print(gene_out)
                    gene_strict = gene_out[0][1]
                    gene_perm = gene_out[1][1]
                    if gene_strict is not None:
                        out_strict.write(">" + gene + "\n" + gene_strict +
                                         "\n")
                    if gene_perm is not None:
                        out_f.write(">" + gene + "\n" + gene_perm + "\n")
Example #4
def find_overlap(pairs, file_output):
    with open(file_output, "w+") as w:
        Hmer_total = []
        df_Counts_Total = pd.DataFrame(columns=['Sequence', 'Length', 'Base', 'InORF(T/F)'])

        for i in range(len(pairs)):
            print(pairs[i][0])
            print(pairs[i][1])
            # create data frames from the hmer.py txt output and the Prodigal gff output files
            dfH = pd.read_table(pairs[i][0], names=["Sequence", "Acession_ID", "Start", "End", "Length", "Base", "Info"], sep='\t')

            ORFLarge = gffpd.read_gff3(pairs[i][1])
            # read_gff3 already stores a pandas DataFrame; copy it into a plain frame
            dfORFLarge = pd.DataFrame(ORFLarge.df)


            # remove excess info from the gff data frame
            selected_columns = dfORFLarge[["seq_id", "start", "end"]]
            dfORF = selected_columns.copy()

            scaffolds = dfH['Sequence'].nunique()
            if scaffolds == 1:

                # use numpy broadcasting to subtract each hmer (start, end)
                # from every ORF (start, end) in a single step

                arrayH = np.array(dfH[['Start', 'End']])

                arrayORF = np.array(dfORF[['start', 'end']])

                # 3D array: sets = number of hmers, rows = number of ORFs, columns = 2 (start, end)
                combined = (arrayORF - arrayH[:, np.newaxis]).reshape(-1, arrayORF.shape[0], 2)

                # a hmer lies inside an ORF when (ORF start - hmer start) is negative
                # and (ORF end - hmer end) is positive
                T_F = np.where((combined[:, :, 0] < 0) & (combined[:, :, 1] > 0),
                               True, False).reshape(-1, 1, combined.shape[1])

                # reduce to one value per set: True if the hmer falls inside any ORF
                T_F_clean = np.any(T_F[:, 0, :], axis=1).reshape(-1, 1)

                T_F_dataframe = pd.DataFrame(data=T_F_clean, columns=['In_ORF'])

                # add the in-ORF column to the hmer data frame
                # (assign the Series, not the one-column DataFrame)
                dfH['InORF(T/F)'] = T_F_dataframe['In_ORF']

                dfCounts = (dfH[['Sequence', 'Acession_ID', 'Length', 'Base',
                                 'InORF(T/F)', 'Info']]
                            .value_counts().sort_index().reset_index(name="Counts"))

                Hmer_total.append(dfCounts)
            else:
                print("Skip")

        Counts = pd.concat(Hmer_total)

        Counts.to_csv(w, sep="\t", index=False)
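The broadcasting trick in find_overlap can be seen in isolation with toy coordinates; this sketch flags each hmer that falls inside any ORF:

import numpy as np

hmers = np.array([[5, 8], [20, 25]])   # (n_hmers, 2) start/end pairs
orfs = np.array([[1, 10], [30, 40]])   # (n_orfs, 2) start/end pairs

# diff[i, j] = orfs[j] - hmers[i]  ->  shape (n_hmers, n_orfs, 2)
diff = orfs - hmers[:, np.newaxis]

# hmer i sits inside orf j when (orf start - hmer start) < 0
# and (orf end - hmer end) > 0
inside = (diff[:, :, 0] < 0) & (diff[:, :, 1] > 0)
print(inside.any(axis=1))              # [ True False]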
Example #5
def f_import_transform_gtf(global_para):
    df_gff = gffpd.read_gff3(global_para.gtf_file)
    df_gff_all = df_gff.df
    df_gff_all = df_gff_all.astype({"seq_id":"str"})
    df_gff_cds = df_gff_all.query('type == @global_para.feature_type_cds')
    if global_para.source != "all":
        tmp_list_source = global_para.source.split(',')
        df_gff_cds = df_gff_cds.query('source in @tmp_list_source')
    df_gff_cds_expand = f_gtf_attributes_expand(df_gff_cds)
    df_gff_cds_expand.rename(columns={
        global_para.featureid_gene_id: 'gene_id',
        global_para.featureid_gene_name: 'gene_name',
        global_para.featureid_transcript_id: 'transcript_id'}, inplace=True)
    return df_gff_cds_expand
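The '@name' references in those query() calls pull in local Python variables, which is what lets the feature type and source list come from global_para; a minimal sketch with hypothetical values:

import pandas as pd

df = pd.DataFrame({'type': ['CDS', 'gene', 'CDS'],
                   'source': ['ena', 'refseq', 'ena']})
feature_type_cds = 'CDS'
tmp_list_source = ['ena']

# '@' makes DataFrame.query look the name up in the enclosing Python scope
subset = df.query('type == @feature_type_cds').query('source in @tmp_list_source')
print(subset)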
Example #6
def main(map_file, gtf_file):
    df_gtf = gffpd.read_gff3(map_file).df.reset_index()

    # the attribute string sits in the 'phase' column slot of this map file;
    # split on the first '=' only so values containing '=' survive
    df_gtf['transcript_id'] = df_gtf['phase'].apply(
        lambda x: dict(e.split('=', 1) for e in x.split(';')).get('Note'))

    for row in df_gtf.itertuples():
        # 'index' is the original frame index, kept as a column by reset_index()
        uniprot = row.index
        tid = row.transcript_id

        if tid is None:
            continue

        print(uniprot, tid)
        append_line(gtf_file, tid, uniprot)
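The dict-from-attribute-string idiom used for transcript_id can be exercised on its own; splitting on the first '=' only keeps values that themselves contain '=' intact:

attrs = 'ID=t1;Note=ENST00000335137;extra=a=b'
parsed = dict(field.split('=', 1) for field in attrs.split(';'))
print(parsed.get('Note'))   # ENST00000335137
print(parsed.get('extra'))  # a=b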
Example #7
    sep='\t')

expect_combined = []

table_df = pd.DataFrame(columns=[
    "Sequence", "Length", "BP_inside", "Expected", "Hmer_inside(BP)",
    "Hmer_outside(BP)", "Observed"
])

with open("/Volumes/ubdata/mmcfad/NCBI_Genomes/Output_files/Code_content.txt",
          "w+") as w:

    for i in range(len(files_orf)):
        print(files_orf[i])
        #Determining the expected amounts inside vs outside open reading frames
        orf = gffpd.read_gff3(files_orf[i])

        df_orfLarge = pd.DataFrame(orf.df)
        selected_columns = df_orfLarge[["seq_id", "start", "end"]]
        ORF = selected_columns.copy()
        scaffolds = ORF['seq_id'].nunique()

        if scaffolds == 1:
            # genome length and genetic code come from the gff header comments
            header = orf.header

            split = header.split(" ")
            print(split)
            select_split = split[-1].replace(";", "=").split("=")
            print(select_split)
            # with a Prodigal-style header this token is the transl_table value
            code = select_split[4]
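Indexing into a space/semicolon split is brittle; assuming a Prodigal-style header comment, the same genetic-code value can be pulled out with a regex (the header string below is illustrative):

import re

header = ('# Model Data: version=Prodigal.v2.6.3;run_type=Single;'
          'model="Ab initio";gc_cont=50.45;transl_table=11;uses_sd=1')
match = re.search(r'transl_table=(\d+)', header)
code = match.group(1) if match else None
print(code)  # 11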
Example #8
def test_to_gff3():
    # gff_content and written_gff are module-level fixtures in the full test file
    assert gff_content == written_gff
    read_gff_output = gff3pd.read_gff3('temp.gff')
    read_in_file = gff3pd.read_gff3('fixtures/test_file.gff')
    pd.testing.assert_frame_equal(read_in_file.df, read_gff_output.df)
Example #9
def generate_gff3_df():
    read_in_file = gff3pd.read_gff3('fixtures/test_file.gff')
    return read_in_file
Example #10
def MakeGFFindex(gff3file):
    GFFindex = gffpd.read_gff3(gff3file)

    return GFFindex.df
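Usage is a one-liner; the path is hypothetical, and the returned frame carries the standard GFF3 columns:

index_df = MakeGFFindex('annotation.gff3')
print(index_df.columns.tolist())
# ['seq_id', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']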
Example #11
File: GFF2MSS.py Project: maedat/GFF2MSS
    protein_id = args.pid
    if protein_id != "NOFILE":
        pid_DF = pd.read_csv(protein_id, sep='\t')
    else:
        pid_DF = False
    organism_name_in = args.nam
    strain_in = args.stn

    link_evi = args.gty

    locus_tag_counter = 0  # initialization
    PreContig = ""  # initialization
    Contig_Count = 0  # initialization
    OUT_CHA = ""  # initialization

    gff_df = gffpd.read_gff3(in_file)
    gff_df_col = gff_df.attributes_to_columns()
    gff_df_col = gff_df_col.sort_values('start')
    with open(out_path, mode='w') as f:
        f.write(OUT_CHA)
    for record in SeqIO.parse(fasta_in, 'fasta'):
        record = record.upper()  # all bases should be capitalized
        print("new_contig")
        features = []  # initialize the features (sometimes absent)
        length = len(record)  # get the sequence length
        NowContig = record.id  # get the sequence name
        print("Processing " + NowContig)
        OUT_CHA += FASTA_CHA_SET(length, NowContig, organism_name_in,
                                 strain_in, mol_type_in)
        ## detect and describe the gap region from the fasta
        print("Gap finding")
Example #12
# Load the libraries
import pandas as pd
import pathlib
import os
import gffpandas.gffpandas as gffpd
import csv

# Creating in- and output file names
inPath = "../../data/Reference_genome_and_annotation_file/PGSC_DM_V403_genes.gff"
outPath = "../../data/Reference_genome_and_annotation_file/PGSC_DM_V403_genes_case-corrected.gff"

# Create absolute path and read the file
absInPath = os.path.abspath(inPath)
rawFile = gffpd.read_gff3(absInPath)


# Define the function for changing the attributes column of the gff3 data frame
def attrChange(attrString):
    # Create a list of official GFF3 attributes from https://www.ncbi.nlm.nih.gov/datasets/docs/about-ncbi-gff3/
    # This is needed to check for invalid cases in the input file
    offList = ['ID', 'Parent', 'Dbxref', 'Name', 'Note', 'Is_circular']
    attrList = attrString.split(';')
    # Check for improper separation of values within the ';'-separated list
    ''' # Commented out because 'gff3ToGenePred' can't handle multiple names per annotation
    for j in range(len(attrList)):
        if '=' not in attrList[j]:
            attrList[j-1] = attrList[j-1] + ';' + attrList[j]
    '''
    dummyList = attrList
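The snippet ends before attrChange returns; a minimal sketch of how it might finish, assuming the goal is to case-correct attribute keys against offList (the helper name is hypothetical):

def attr_change_sketch(attrString):
    offList = ['ID', 'Parent', 'Dbxref', 'Name', 'Note', 'Is_circular']
    lookup = {name.lower(): name for name in offList}
    fixed = []
    for field in attrString.split(';'):
        key, _, value = field.partition('=')
        # map e.g. 'id=' or 'Id=' to the official capitalization 'ID='
        fixed.append(lookup.get(key.lower(), key) + '=' + value)
    return ';'.join(fixed)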
Example #13
import gffpandas.gffpandas as gffpd


file = 'utr_features/MANE.GRCh38.v0.93.select_ensembl_genomic.gff'

annotation = gffpd.read_gff3(file)
attr_to_columns = annotation.attributes_to_columns()

gene_to_transcript_df = (attr_to_columns[['gene_name', 'transcript_id']]
                         .drop_duplicates()
                         .dropna())
gene_to_transcript_df.to_csv('geneToTranscript.csv', index=False)
Example #14
import gffpandas.gffpandas as gffpd
import sys
import re

annotation = gffpd.read_gff3(sys.argv[1])

newattr = []
for x in annotation.df['attributes']:
    # pull the common name out of the attribute string
    s = x.split('source_gene_common_name=')[1]
    s = s.split(';')[0]
    if s != 'None':
        # swap the gene_id value for the common name
        newattr.append(re.sub('gene_id=.+?(?=;)', 'gene_id=' + s, x))
    else:
        newattr.append(x)

annotation.df['attributes'] = newattr
annotation.to_gff3(re.sub(r'\.gff3$', '.geneid.gff3', sys.argv[1]))
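The rewrite can be checked on a single attribute string; note the lazy 'gene_id=.+?(?=;)' pattern needs a ';' after the gene_id field to match:

import re

attr = 'gene_id=ENSG0001;source_gene_common_name=GAL4;biotype=protein_coding'
common = attr.split('source_gene_common_name=')[1].split(';')[0]
print(re.sub('gene_id=.+?(?=;)', 'gene_id=' + common, attr))
# gene_id=GAL4;source_gene_common_name=GAL4;biotype=protein_coding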
Example #15
gff = pd.read_table('tene_ALL_filenames_out.txt', names=['gff'])
gff2 = gff['gff'].tolist()


overlap = pd.read_table("/Volumes/ubdata/mmcfad/NCBI_Genomes/Output_files/July_6_ALL.txt", sep='\t')

expect_combined = []

table_df = pd.DataFrame(columns=["Sequence", "Length", "BP_inside", "Expected",
                                 "Hmer_inside(BP)", "Hmer_outside(BP)", "Observed"])

with open("/Volumes/ubdata/mmcfad/NCBI_Genomes/Output_files/Observed_expected_tene_ALL.txt", "w+") as w:

    for i in range(len(gff2)):
        print(gff2[i])
        #Determining the expected amounts inside vs outside open reading frames
        orf = gffpd.read_gff3(gff2[i])

        df_orfLarge = pd.DataFrame(orf.df)
        selected_columns = df_orfLarge[["seq_id", "start", "end"]]
        ORF = selected_columns.copy()
        scaffolds = ORF['seq_id'].nunique()

        if scaffolds == 1:
            #Length of the genome from the header of the gff file
            header = orf.header

            split = header.split(" ")
            select_split = split[5].replace(";", "=").split("=")
            length = int(select_split[3])

            seq = ORF["seq_id"].unique()
Example #16
import sys

import gffpandas.gffpandas as gffpd
from Bio import SeqIO
from Bio.Seq import Seq

nucleotides, features, samplename, cov, outdir = sys.argv[1:]

## read nucleotides
seq = str(SeqIO.read(nucleotides, "fasta").seq)

## read features
gffdict = gffpd.read_gff3(features).df.to_dict(orient="index")

for k in gffdict.keys():
    # the second ';'-separated attribute field holds the ORF name as key=value
    orfname = str(gffdict[k].get("attributes").split(";")[1].split("=")[-1])
    start = int(gffdict[k].get("start"))
    end = int(gffdict[k].get("end"))

    # GFF3 coordinates are 1-based and end-inclusive, hence the start - 1 slice
    sliced_seq = seq[start - 1 : end].replace("-", "")
    AminoAcids = Seq(sliced_seq).translate(to_stop=True)

    with open(f"{outdir}/{orfname}/{samplename}_{cov}.fa", "w") as out:
        out.write(f">{samplename}_ORF-{orfname}_cov_{cov}\n" f"{AminoAcids}\n")