# Main section of the extractSeqFastaFromLen script: prints a banner, reads the
# run settings from `args`, then walks the sequences from longest to shortest.
print("#################################################################")
print("# Welcome in extractSeqFastaFromLen (Version " + version + ") #")
print("#################################################################")
print('Start time: ', start_time, '\n')

# Retrieve the config file passed as argument
fastaFile = os.path.abspath(args.fastaFile)
outputfilename = os.path.abspath(args.paramoutfile)
lenSize = args.lenSize  # length threshold compared against each sequence length below
keep = args.keep  # selection mode; the values 'g' and 'l' are tested below
# NOTE(review): opened without a context manager; the matching close() is not
# visible in this fragment.
output_handle = open(outputfilename, "w")
# presumably maps sequence ID -> sequence length (used that way below) - confirm against lenSeq2dict
dicoSize = lenSeq2dict(fastaFile)
# presumably maps sequence ID -> record exposing a `.seq` attribute - confirm against fasta2dict
dicoFasta = fasta2dict(fastaFile)
filename = fastaFile.split('/')[-1].replace('.fasta', '')
nbTotal = len(dicoFasta.keys())
count = 1
# Visit the IDs ordered by their dicoSize value, largest first.
for ID in sorted(dicoSize.keys(), key=dicoSize.get, reverse=True):
    lenSeq = dicoSize[ID]
    sequence = dicoFasta[ID]
    # Text before the first '_' of the output file name, with any '.fasta' removed.
    strain = outputfilename.split('/')[-1].split('_')[0].replace( '.fasta', '')
    seqName = f"Scaffold_{count}"
    #seqName = '%s_%s'%(strain,seqName)
    descrip = "length={}".format(lenSeq)
    # Only consider sequences that contain more than 20 characters other than 'N'.
    if str(sequence.seq).count('N') < (len(str(sequence.seq)) - 20):
        # NOTE(review): the condition below is cut off in the visible source; it is
        # reproduced exactly as found and must be completed from the full file.
        if keep == 'g' and lenSeq >= lenSize or (keep == 'l'
########### Gestion directory ############## verifFichier(gff) verifFichier(fasta) nameGFF = gff.split('/')[-1].replace('.gz','') ####################### main ################# # Create Variable for the start and stop codon, this variable help for evaluate the annotation start_codon = 'ATG' stop_codon = ['TGA','TAA','TAG'] # Create dictionary for cds and gene information (start, stop and strand) dico_cds = collections.defaultdict(dict) dico_gene = collections.defaultdict(list) augustus = False fasta_dico = fasta2dict(fasta) liste_scaff = fasta_dico.keys() # Initiate the dictionary for cds for elt in liste_scaff : dico_cds[elt] = collections.defaultdict(list) # Open gff for retrieve all information for cds and gene with open(gff,'rt') as gff_file : for line in gff_file : if '# This output was generated with AUGUSTUS' in line : augustus = True if line[0] != '#' : tabLine = line.split('\t') type = tabLine[2] # Parse gene information if type == 'gene' : id = tabLine[-1].strip().split(';')[0].replace('ID=','')
# Rewrite the CDS and protein FASTA files of `folder` with headers enriched by
# the position and source found in the merged GFF3 annotation.
cdsFile = directory + "/" + folder + '_cds.fna'
protFile = directory + "/" + folder + '_protein.faa'
gffFile = directory + "/" + folder + '_merge.gff3'

# Load the whole annotation; the context manager closes the handle even if
# reading fails (the previous open()/close() pair leaked it on error).
with open(gffFile, 'r') as f:
    lines = f.readlines()

print('Creation dico GFF')
# Index every line mentioning 'mRNA' by its ID attribute.
# NOTE(review): this is a substring test on the whole line, not on the feature
# type column, so lines of other types whose attributes contain 'mRNA' are
# indexed too. Behaviour kept unchanged.
for line in lines:
    if 'mRNA' in line:
        lineSplit = line.split('\t')
        # Text between the last 'ID=' and the first ';Parent=' of column 9.
        ids = lineSplit[8].split('ID=')[-1].split(';Parent=')[0]
        position = 'pos=%s_%s:%s' % (lineSplit[0], lineSplit[3], lineSplit[4])
        tools = lineSplit[1]
        dico_Gff[ids] = (position, tools)

print('Creation dico des fasta')
dico_cds = fasta2dict(cdsFile)
dico_prot = fasta2dict(protFile)

# CDS output: same records, written next to the input with a '.fasta' suffix.
# The `with` block guarantees the output is flushed and closed on any error.
with open(cdsFile.replace('.fna', '.fasta'), 'w') as f:
    for idSeq in sorted(dico_cds.keys(), key=sort_human):
        position = dico_Gff[idSeq][0]
        tools = dico_Gff[idSeq][1]
        seqObj = dico_cds[idSeq].seq
        length = len(str(seqObj))
        record = SeqRecord(seqObj, id=idSeq, name=idSeq,
                           description='| %s | %s | %s ' % (position, tools, length))
        SeqIO.write(record, f, "fasta")

# Protein output. The handle is deliberately left as a plain open(): the loop
# below continues beyond the visible part of the file, where the remaining
# writes and the matching close() are expected.
f = open(protFile.replace('.faa', '.fasta'), 'w')
for idSeq in sorted(dico_prot.keys(), key=sort_human):
    # The GFF index is looked up with '0P' replaced by '0T' in the protein ID.
    position = dico_Gff[idSeq.replace('0P', '0T')][0]
    tools = dico_Gff[idSeq.replace('0P', '0T')][1]
    length = len(str(dico_prot[idSeq].seq))
# Startup banner for the QualityAssemblage script, styled through the project's form() helper.
print(form("\n\t---------------------------------------------------------",'yellow','bold'))
print("\t"+form("|",'yellow','bold')+form(" Welcome in QualityAssemblage (Version " + version + ") ",type='bold')+form("|",'yellow','bold'))
print(form("\t---------------------------------------------------------",'yellow','bold')+'\n')

################## Main ################################
# Write one tab-separated statistics table to outFile, starting with its header row.
with open(outFile,'w') as f :
    f.write('n\tn:500\tL50\tmin\tN80\tN50\tN20\tE-size\tmax\tsum\tname\n')
    # Process every entry of `directory` whose name ends with '.fasta'.
    for files in os.listdir(directory):
        if files.endswith('.fasta') :
            print(files+ ' in process')
            # Plain concatenation: assumes `directory` already ends with a path
            # separator - TODO confirm where `directory` is built.
            Pathfile = directory+files
            # presumably validates that the file is FASTA - confirm against isFasta
            isFasta(Pathfile)
            # presumably derives the strain identifier from the file name - confirm against recupId
            strain = recupId(Pathfile.split('/')[-1])
            dico_fasta = fasta2dict(Pathfile)
            # Accumulators reset for each file; their names match columns of the header above.
            lengthGenome = 0
            nbScaffold = 0
            lengthN50 = 0
            lengthN80 = 0
            lengthN20 = 0
            Esize = 0
            L50 = 0
            n500 = 0
            first = True
            # One iteration per sequence of the file; the rest of this loop body is
            # not visible in this fragment.
            for elt in dico_fasta.values():
                nbScaffold += 1
# Count the genes predicted by each annotation source for the genome `name`,
# then append one summary row (counts, mean gene length, genome size in Mb)
# to the already-open report handle `f`.
nbGeneA = 0       # 'gene' features whose source column is AUGUSTUS_BGPI
nbGeneB = 0       # 'gene' features whose source column is BRAKER
lengthGene = 0    # cumulative length of all counted genes
lengthGenome = 0  # cumulative length of all sequences of the genome
for line in lines:
    # Skip comment/directive lines; startswith() is also safe on an empty string.
    if line.startswith('#'):
        continue
    # Split once instead of once per column.
    columns = line.split('\t')
    # Blank or truncated lines have no type/coordinate columns: ignore them
    # rather than failing with IndexError.
    if len(columns) < 5:
        continue
    typeAnnotation, typeLine, posStart, posEnd = columns[1:5]
    if typeLine != 'gene' or typeAnnotation not in ('AUGUSTUS_BGPI', 'BRAKER'):
        continue
    # GFF start/end are 1-based and inclusive, so the feature spans
    # end - start + 1 bases (the previous end - start was one base short).
    length = int(posEnd) - int(posStart) + 1
    if typeAnnotation == 'AUGUSTUS_BGPI':
        nbGeneA += 1
    else:
        nbGeneB += 1
    lengthGene = lengthGene + length

fastaPath = genome + '/' + name + '.fasta'
dico_fasta = fasta2dict(fastaPath)
lengthGenome = lengthGenome + sum(len(elt.seq) for elt in dico_fasta.values())

nbGene = nbGeneA + nbGeneB
# Report a mean length of 0.0 instead of raising ZeroDivisionError when the
# annotation contains no gene from either source.
meanLengthGene = round(lengthGene / nbGene, 2) if nbGene else 0.0
row = (name, nbGeneA, nbGeneB, nbGene, meanLengthGene,
       round(lengthGenome / 1000000, 2))
# Every cell is centred on 10 characters; cells are tab-separated.
f.write('\t'.join(str(cell).center(10) for cell in row) + '\n')
print('%s Done' % name)
f.close()