def get_Short(gene, auxBar):
    """Build the "short" (representative alleles) file for one locus FASTA.

    Alleles of ``gene`` are processed in file order.  The first allele that
    translates and yields a BLAST self-score becomes the first
    representative.  Each later allele is put in a BLAST database of its own
    and all representatives chosen so far are BLASTed against it; the allele
    is added as a new representative only when the best BLAST score ratio
    (hit score / self-score of the querying representative) lies in
    ``[0.6, 0.7)``.

    Files written (paths derived from ``gene``):

    * ``short/<name>_short.fasta``: DNA of the representative alleles.
    * ``short/<name>_short.fasta_bsr.txt``: pickled dict mapping the integer
      allele id to its BLAST self-score.
    * ``short/temp/<name>/``: directory whose paths are handed to
      ``CommonFastaFunctions.Create_Blastdb_no_fasta``.

    Args:
        gene: Path of the locus FASTA file.  The text after the last ``_`` of
            every allele name must be an integer.
        auxBar: Collection of gene paths; when ``gene`` is one of them a
            textual progress bar based on its position is printed.

    Returns:
        Always ``True``.
    """
    blastPath = 'blastp'
    gene = str(gene)

    _make_dirs(os.path.join(os.path.dirname(gene), "short"))

    shortgene = os.path.join(os.path.dirname(gene), "short",
                             os.path.basename(gene))
    shortgene = shortgene.replace(".fasta", "_short.fasta")

    tempgene = os.path.join(os.path.dirname(shortgene), "temp",
                            os.path.basename(gene).replace(".fasta", ""))
    tempgeneProt = os.path.join(tempgene, os.path.basename(gene))
    tempgeneProt2 = tempgeneProt.replace(".fasta", "2.fasta")
    _make_dirs(tempgene)

    geneScorePickle = shortgene + '_bsr.txt'

    shortfasta = ''          # DNA FASTA text of the representatives
    tempgeneProtFasta = ''   # protein FASTA text of the representatives
    var = {}                 # allele id -> self-score (pickled at the end)
    selfscores = []          # self-scores, same order as tempgeneProtFasta
    fasta_corrected = ''
    total_alleles = 0
    error_alleles = 0

    for allele in SeqIO.parse(gene, "fasta", generic_dna):
        total_alleles += 1
        try:
            dnaSequence = str(allele.seq.upper())
            translatedSequence, sortedSeq, originalSeq = translateSeq(
                dnaSequence)

            if not originalSeq:
                fasta_corrected += '>' + str(
                    allele.name) + '\n' + str(sortedSeq) + '\n'

            alleleI = int(allele.name.split("_")[-1])
            dnaEntry = '>' + str(allele.name) + '\n' + dnaSequence + '\n'
            protEntry = '>' + str(
                allele.name) + '\n' + str(translatedSequence) + '\n'

            isFirst = not selfscores

            # Database holding only this allele; BLASTing the allele against
            # it gives the self-score.
            Gene_Blast_DB_name = CommonFastaFunctions.Create_Blastdb_no_fasta(
                tempgeneProt if isFirst else tempgeneProt2, 1, True,
                protEntry)
            selfscore = _first_hsp_score(
                _run_blastp(protEntry, Gene_Blast_DB_name, blastPath))

            if not isFirst:
                # BLAST every current representative against this allele.
                bestbsr = 0
                bestscore = 0
                for blast_record in _run_blastp(tempgeneProtFasta,
                                                Gene_Blast_DB_name,
                                                blastPath):
                    for alignment in blast_record.alignments:
                        for match in alignment.hsps:
                            # The number after the last "_" of query_id is
                            # used as 1-based index into the representatives.
                            alleleMatchid = int(
                                blast_record.query_id.split("_")[-1])
                            bsr = float(match.score) / float(
                                selfscores[alleleMatchid - 1])
                            if (bsr > bestbsr and match.score > bestscore
                                    and bsr >= 0.6):
                                bestbsr = bsr
                                bestscore = match.score

                if not (0.6 <= bestbsr < 0.7):
                    continue

            # Commit the allele as a representative only now, after every
            # step that can fail, so the four structures stay in step.
            shortfasta += dnaEntry
            tempgeneProtFasta += protEntry
            var[alleleI] = selfscore
            selfscores.append(selfscore)

        except Exception:
            # Deliberately broad: any failure for this allele (translation,
            # name parsing, BLAST) only counts it as an error allele.
            error_alleles += 1

    if error_alleles < total_alleles:
        with open(geneScorePickle, 'wb') as f:
            pickle.dump(var, f)

        with open(shortgene, 'w') as f:
            f.write(shortfasta)

    else:
        print("ATTENTION!!!111 \n" + str(gene) +
              " has no correct aleles, the file will be removed!!")
        os.remove(gene)

    if len(fasta_corrected) > 1:
        # NOTE(review): this rewrites ``gene`` with only the alleles for which
        # translateSeq returned a falsy third value, and it also re-creates
        # the file when it was removed just above -- confirm this is intended.
        with open(gene, 'w') as f:
            f.write(fasta_corrected)

    # print status bar
    if gene in auxBar:
        auxlen = len(auxBar)
        index = auxBar.index(gene)
        print("[" + "=" * index + ">" + " " * (auxlen - index) +
              "] processed " + str(int((float(index) / auxlen) * 100)) +
              "%")

    return True


def _make_dirs(path):
    """Create ``path`` with its parents; an existing directory is not an error."""
    try:
        os.makedirs(path)
    except OSError:
        # Only tolerate "already there"; report genuine failures.
        if not os.path.isdir(path):
            raise


def _run_blastp(query_fasta, db_name, blast_path):
    """Run blastp with ``query_fasta`` on stdin against ``db_name``.

    Returns the parsed BLAST XML records as a list.
    """
    cline = NcbiblastpCommandline(cmd=blast_path,
                                  db=db_name,
                                  evalue=0.001,
                                  outfmt=5,
                                  num_threads=1)
    out, _err = cline(stdin=query_fasta)
    return list(NCBIXML.parse(StringIO(out)))


def _first_hsp_score(blast_records):
    """Return the integer score of the first HSP; raise ValueError if none."""
    for blast_record in blast_records:
        for alignment in blast_record.alignments:
            for match in alignment.hsps:
                return int(match.score)
    raise ValueError("BLAST returned no hits")
def main(input_file, temppath, blastPath, verbose, bsrTresh, sizeTresh, ns):
    """Call the allele of one locus in every genome and pickle the calls.

    ``input_file`` is a pickle holding ``[geneFile, genomesList, genesList]``:
    the locus FASTA, the genome FASTA paths and the path of a text file with
    one locus path per line (used only for the progress message).

    For each genome the pickled CDS dictionary
    ``<temppath>/<genome basename>_ORF_Protein.txt`` is loaded, then:

    1. If CDS sequences are identical to known alleles the genome is called
       without BLAST: the allele id for a single match, ``NIPHEM`` for
       several.
    2. Otherwise the representative proteins are BLASTed against
       ``<temppath>/<genome>/<genome>_db`` and the best hit is classified as
       ``LNF``, ``NIPH``, ``LOTSC``, ``PLOT3``, ``PLOT5``, ``ALM``, ``ASM``
       or ``INF-<new id>``.  Inferred alleles are queued for the locus FASTA
       and, when their BSR is in ``[bsrTresh, bsrTresh + 0.1)``, also for the
       short FASTA, after which the BSR table is recomputed.

    Exactly one call per genome is appended to each result list; ``ERROR`` is
    recorded when a genome cannot be classified.

    Outputs: ``<temppath>/<gene basename>_result.txt`` (pickled
    ``(resultsList, calls)``), ``<temppath>/<gene basename>_result2.txt``
    (pickled location calls), appended alleles in the locus and short FASTA
    files.  The per-locus work directory is deleted at the end.

    Args:
        input_file: Path of the pickled argument list.
        temppath: Directory with the per-genome CDS pickles and BLAST dbs.
        blastPath: blastp executable.
        verbose: ``'True'`` (string) or ``True`` enables progress messages.
        bsrTresh: Minimum BLAST score ratio for a hit to be considered.
        sizeTresh: Allowed relative deviation from the mode allele length, or
            ``None`` to skip the size checks.
        ns: When ``True`` new allele ids carry a ``*`` prefix.

    Returns:
        Always ``True``.
    """
    # The flag is compared against the string 'True'; a real boolean True is
    # accepted as well instead of being silently turned off.
    verbose = verbose is True or verbose == 'True'

    # NOTE(review): pickle.load can execute arbitrary code; input_file (and
    # the CDS pickles read below) must come from a trusted source.
    with open(input_file, 'rb') as f:
        argumentList = pickle.load(f)

    if verbose:
        def verboseprint(*args):
            # All arguments on one line, space separated, newline terminated.
            print(*args)
    else:
        def verboseprint(*args):
            return None

    geneFile = argumentList[0]

    verboseprint("Using gene: " + str(geneFile))
    shortgeneFile = os.path.join(os.path.dirname(geneFile), "short", os.path.basename(geneFile))
    shortgeneFile = shortgeneFile.replace(".fasta", "_short.fasta")
    genomesList = argumentList[1]
    genesList = argumentList[2]

    with open(genesList, 'r') as gene_fp:
        newListgenes = [line.rstrip('\n').rstrip('\r') for line in gene_fp]

    locusnumber = newListgenes.index(str(geneFile))
    totalocusnumber = len(newListgenes)
    statusbar = float(locusnumber) / totalocusnumber
    basepath = os.path.join(temppath, os.path.splitext(geneFile)[0])
    newDNAAlleles2Add2Fasta = ''
    newDNAAlleles2Add2shortFasta = ''

    print('\rProcessing ' + os.path.basename(geneFile) + ". Start " + time.strftime("%H:%M:%S-%d/%m/%Y") + " Locus " + str(
          locusnumber) + " of " + str(totalocusnumber) + ". Done " + str(int(statusbar * 100)) + "%.", end="")

    if not os.path.exists(basepath):
        os.makedirs(basepath)

    fullAlleleList = []
    fullAlleleNameList = []
    alleleI = 0
    # get full list of alleles from main gene file and last allele number id
    for allele in SeqIO.parse(geneFile, "fasta"):
        alleleI = allele.id.split("_")[-1]
        fullAlleleList.append(str(allele.seq.upper()))
        fullAlleleNameList.append(allele.id)

    resultsList = []
    perfectMatchIdAllele = []    # one allele call per genome
    perfectMatchIdAllele2 = []   # one location call per genome

    verboseprint("Getting BSR at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

    geneScorePickle = os.path.abspath(shortgeneFile) + '_bsr.txt'

    # the BLAST for the BSR is only requested when no score file exists yet
    doAll = not os.path.isfile(geneScorePickle)
    allelescores, alleleList, listShortAllelesNames = getBlastScoreRatios(shortgeneFile, basepath, doAll, verbose,
                                                                          blastPath)

    with open(os.path.join(basepath, str(os.path.basename(shortgeneFile) + '_protein.fasta')), 'r') as myfile:
        proteinFastaString = myfile.read()
        proteinFastaString += "\n"

    verboseprint("Finished BSR at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

    verboseprint("starting allele call blast at: " + time.strftime("%H:%M:%S-%d/%m/%Y"))
    for genomeFile in genomesList:
        verboseprint(genomeFile)
        bestmatch = [0, 0, False, '',
                     0]  # score, score ratio, perfectmatch, key name of the DNA sequence string, allele ID

        # load the CDS from the genome to a dictionary
        filepath = os.path.join(temppath, str(os.path.basename(genomeFile)) + "_ORF_Protein.txt")

        with open(filepath, 'rb') as f:
            currentCDSDict = pickle.load(f)

        ########################
        # EXACT MATCH, NO BLAST #
        ########################
        # exactCall stays None when BLAST is needed, otherwise it holds the
        # (allele call, location call) pair to record for this genome.
        exactCall = None
        try:
            intersection = list(set(fullAlleleList).intersection(currentCDSDict.values()))

            if len(intersection) > 1:
                verboseprint(os.path.basename(genomeFile) + " has " + str(
                    len(intersection)) + " multiple exact match : " + os.path.basename(
                    geneFile) + " MULTIPLE ALLELES as EXACT MATCH")
                exactCall = ('NIPHEM', 'NIPHEM')

            elif len(intersection) == 1:
                alleleStr = intersection[0]
                # every CDS key carrying this exact sequence
                elem = [key for key, value in currentCDSDict.items() if value == alleleStr]

                if len(elem) > 1:
                    verboseprint(os.path.basename(genomeFile) + " has " + str(
                        len(intersection)) + " multiple exact match : " + os.path.basename(
                        geneFile) + " MULTIPLE ALLELES as EXACT MATCH")
                    exactCall = ('NIPHEM', 'NIPHEM')
                else:
                    contigFields = elem[0].split("&")
                    # starting CDS base need to be +1
                    matchLocation = contigFields[2].split("-")
                    matchLocation = [int(matchLocation[0]) + 1, int(matchLocation[1])]
                    contigname = contigFields[0].replace(">", "")

                    alleleName = fullAlleleNameList[fullAlleleList.index(alleleStr)]
                    alleleMatchid = alleleName.split("_")[-1]

                    try:
                        containedInfo = alleleName.split("_")[1]
                    except IndexError:
                        containedInfo = ''

                    strand = "-" if matchLocation[0] > matchLocation[1] else "+"
                    exactCall = (alleleMatchid,
                                 str(contigname) + "&" + str(matchLocation[0]) + "-" + str(
                                     matchLocation[1]) + "&" + strand)

                    # check if atributed allele is contained or contains
                    if containedInfo == "CD" or containedInfo == "CS":
                        resultsList.append(
                            [(os.path.basename(genomeFile)), str(alleleMatchid), containedInfo.rstrip()])

        except Exception as e:
            # A malformed CDS key or dictionary must not drop the genome from
            # the results: report it and record an ERROR call instead.
            print("some error occurred")
            print(e)
            print('Error on line {}'.format(sys.exc_info()[-1].tb_lineno))
            exactCall = ("ERROR", "ERROR")

        if exactCall is not None:
            perfectMatchIdAllele.append(exactCall[0])
            perfectMatchIdAllele2.append(exactCall[1])
            continue

        #########################
        # NO EXACT MATCH: BLAST #
        #########################
        verboseprint("Blasting alleles on genome at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

        Gene_Blast_DB_name = os.path.join(temppath, str(os.path.basename(genomeFile)) + "/" + str(
            os.path.basename(genomeFile)) + "_db")

        cline = NcbiblastpCommandline(cmd=blastPath, db=Gene_Blast_DB_name, evalue=0.001,
                                      outfmt=5, max_target_seqs=10, max_hsps=10, num_threads=1)

        out, err = cline(stdin=proteinFastaString)

        psiblast_xml = StringIO(out)
        blast_records = NCBIXML.parse(psiblast_xml)

        verboseprint("Blasted alleles on genome at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

        alleleSizes = [len(alleleSeq) for alleleSeq in fullAlleleList]

        # get mode allele size
        moda = max(set(alleleSizes), key=alleleSizes.count)
        contador = Counter(alleleSizes).most_common()

        # if most common allele size appears 1 time, get first allele size
        if (contador[0])[1] == 1:
            moda = alleleSizes[0]

        # Sizes of the result lists before this genome is classified; used
        # below to keep exactly one call per genome when an error occurs.
        priorCalls = len(perfectMatchIdAllele)
        priorCalls2 = len(perfectMatchIdAllele2)

        try:

            # iterate through the blast results
            for blast_record in blast_records:

                # NOTE(review): reset for every record, so the NIPH test below
                # only sees the hits of the last record -- confirm intended.
                locationcontigs = []

                for alignment in blast_record.alignments:

                    # select the best match
                    for match in alignment.hsps:

                        # query id comes with query_id, not name of the allele
                        # the query will always be a representative allele
                        # we get the index of the representative sequence
                        alleleMatchid = int((blast_record.query_id.split("_"))[-1])

                        # query_id starts with 1 and we have to subtract 1 to get index 0
                        # we get the identifier of the representative (including '*') with the int index
                        alleleMatchid2 = (((listShortAllelesNames[alleleMatchid - 1]).split("_"))[-1])

                        scoreRatio = float(match.score) / float(allelescores[alleleMatchid2])

                        cdsStrName = (alignment.title.split(" "))[1]

                        AlleleDNAstr = alleleList[int(alleleMatchid) - 1]
                        verboseprint("BSR : " + str(scoreRatio))

                        if scoreRatio >= bsrTresh:
                            locationcontigs.append(cdsStrName)

                        # select the best match from BLAST results
                        if scoreRatio == 1 and match.score > bestmatch[0]:
                            bestmatch = [match.score, scoreRatio, False, cdsStrName, int(alleleMatchid), match,
                                         len(AlleleDNAstr)]

                        elif (match.score > bestmatch[0] and scoreRatio >= bsrTresh
                              and scoreRatio > bestmatch[1] and bestmatch[2] is False):
                            bestmatch = [match.score, scoreRatio, False,
                                         cdsStrName, int(alleleMatchid), match,
                                         len(AlleleDNAstr)]

            verboseprint("Classifying the match at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

            # if no best match was found it's a Locus Not Found

            # check for ambiguious bases
            if not bestmatch[0] == 0:
                alleleStr = currentCDSDict[">" + bestmatch[3]]
                listambiguousBases = ['K', 'M', 'R', 'Y', 'S', 'W', 'B', 'V', 'H', 'D', 'X', 'N', '-', '.']
                listFoundAmbiguities = [base for base in listambiguousBases if base in alleleStr]

            if bestmatch[0] == 0 or len(listFoundAmbiguities) > 0:

                ###################
                # LOCUS NOT FOUND #
                ###################
                perfectMatchIdAllele.append('LNF')
                perfectMatchIdAllele2.append('LNF')
                if bestmatch[0] == 0:
                    verboseprint("Locus not found, no matches \n")
                else:
                    verboseprint("Locus has strange base \n")

            # if more than one BSR >0.6 in two different CDSs it's a Non Paralog Locus
            elif len(list(set(locationcontigs))) > 1:
                verboseprint("NIPH", "")
                perfectMatchIdAllele.append('NIPH')
                perfectMatchIdAllele2.append('NIPH')
                for elem in locationcontigs:
                    verboseprint(elem)

            # if match with BSR >0.6 and not equal DNA sequences
            else:
                # load the contig info of the genome to a dictionary
                currentGenomeDict = {}
                for contig in SeqIO.parse(genomeFile, "fasta"):
                    currentGenomeDict[contig.id] = len(str(contig.seq.upper()))

                match = bestmatch[5]
                geneLen = bestmatch[6]
                alleleStr = currentCDSDict[">" + bestmatch[3]]

                contigFields = bestmatch[3].split("&")
                matchLocation = contigFields[2].split("-")
                matchLocation = [int(matchLocation[0]) + 1, matchLocation[1]]
                contigname = contigFields[0]

                bestMatchContigLen = currentGenomeDict[contigname]

                protSeq, alleleStr = translateSeq(alleleStr)
                # get extra space to the right and left between the allele and match and check if it's still inside the contig

                rightmatchAllele = geneLen - ((int(match.query_end) + 1) * 3)
                leftmatchAllele = ((int(match.query_start) - 1) * 3)

                # if Reversed swap left and right contig extra
                Reversed = False
                if int(matchLocation[1]) < int(matchLocation[0]):
                    rightmatchContig = bestMatchContigLen - int(matchLocation[0])
                    leftmatchContig = int(matchLocation[1])
                    rightmatchAllele, leftmatchAllele = leftmatchAllele, rightmatchAllele
                    Reversed = True

                else:
                    rightmatchContig = bestMatchContigLen - int(matchLocation[1])
                    leftmatchContig = int(matchLocation[0])

                ###########################
                # LOCUS ON THE CONTIG TIP #
                ###########################

                # check if contig is smaller than the matched allele
                if leftmatchContig < leftmatchAllele and rightmatchContig < rightmatchAllele:

                    perfectMatchIdAllele.append('LOTSC')
                    perfectMatchIdAllele2.append('LOTSC')

                    verboseprint(match, contigname, geneFile, leftmatchAllele, rightmatchAllele,
                                 "Locus is bigger than the contig \n")

                elif leftmatchContig < leftmatchAllele:

                    perfectMatchIdAllele.append('PLOT3')
                    perfectMatchIdAllele2.append('PLOT3')

                    verboseprint(match, contigname, geneFile, leftmatchAllele, rightmatchAllele,
                                 "Locus is on the 3' tip of the contig \n")

                elif rightmatchContig < rightmatchAllele:

                    perfectMatchIdAllele.append('PLOT5')
                    perfectMatchIdAllele2.append('PLOT5')

                    verboseprint(match, contigname, geneFile, leftmatchAllele, rightmatchAllele,
                                 "Locus is on the 5' tip of the contig \n")

                elif sizeTresh is not None and (float(len(alleleStr)) > moda + (moda * sizeTresh)):

                    verboseprint("Locus is larger than mode", moda, alleleStr)

                    perfectMatchIdAllele.append('ALM')
                    perfectMatchIdAllele2.append('ALM')

                elif sizeTresh is not None and (float(len(alleleStr)) < moda - (moda * sizeTresh)):

                    verboseprint("Locus is smaller than mode", moda, alleleStr)

                    perfectMatchIdAllele.append('ASM')
                    perfectMatchIdAllele2.append('ASM')

                else:
                    #######################
                    # ADD INFERRED ALLELE #		# a new allele
                    #######################

                    tagAuxC = 'S'
                    if ns is True:
                        alleleIaux = "*" + str(int(alleleI.replace("*", "")) + 1)
                        alleleI = alleleIaux
                    else:
                        alleleI = str(int(alleleI) + 1)
                        alleleIaux = alleleI

                    # Tag the new allele when it is contained in (CD) or
                    # contains (CS) a known allele; it is still added below.
                    for alleleaux in fullAlleleList:
                        if alleleStr in alleleaux:
                            alleleName = fullAlleleNameList[fullAlleleList.index(alleleaux)]
                            alleleMatchid = (alleleName.split("_"))[-1]
                            tagAuxC = 'CD' + alleleMatchid.rstrip()
                            resultsList.append([(os.path.basename(genomeFile)), str(alleleIaux), tagAuxC])
                            break
                        elif alleleaux in alleleStr:
                            alleleName = fullAlleleNameList[fullAlleleList.index(alleleaux)]
                            alleleMatchid = (alleleName.split("_"))[-1]
                            tagAuxC = 'CS' + alleleMatchid.rstrip()
                            resultsList.append([(os.path.basename(genomeFile)), str(alleleIaux), tagAuxC])
                            break

                    tagAux = 'INF'

                    perfectMatchIdAllele.append(tagAux + "-" + str(alleleIaux))

                    if not Reversed:
                        perfectMatchIdAllele2.append(str(contigname) + "&" + str(matchLocation[0]) + "-" + str(
                            matchLocation[1]) + "&" + "+")
                    else:
                        perfectMatchIdAllele2.append(str(contigname) + "&" + str(matchLocation[0]) + "-" + str(
                            matchLocation[1]) + "&" + "-")

                    verboseprint(
                        "New allele! Adding allele " + tagAux + str(alleleIaux) + " to the database\n")

                    # --- add the new allele to the gene fasta --- #
                    appendAllele = ('>' + str((((os.path.basename(geneFile)).split("."))[0]).replace("_", "-"))
                                    + "_" + tagAuxC + "_"
                                    + (str(os.path.basename(genomeFile))).replace("_", "-")
                                    + "_" + time.strftime("%d/%m/%YT%H:%M:%S") + '_' + str(alleleIaux))

                    newDNAAlleles2Add2Fasta += appendAllele + "\n" + alleleStr + '\n'

                    fullAlleleList.append(alleleStr)
                    fullAlleleNameList.append(appendAllele)

                    if float(bestmatch[1]) >= bsrTresh and float(bestmatch[1]) < bsrTresh + 0.1:

                        newDNAAlleles2Add2shortFasta += appendAllele + "\n" + alleleStr + '\n'

                        geneTransalatedPath2 = os.path.join(basepath, str(
                            os.path.basename(shortgeneFile) + '_protein2.fasta'))

                        proteinFastaString += '>' + alleleIaux + '\n' + str(protSeq) + '\n'

                        # --- remake blast DB and recalculate the BSR for the locus --- #
                        alleleList.append(alleleStr)
                        listShortAllelesNames.append(appendAllele)

                        sequence_2_blast = '>' + alleleIaux + '\n' + str(protSeq)
                        Gene_Blast_DB_name2 = CommonFastaFunctions.Create_Blastdb_no_fasta(
                            geneTransalatedPath2, 1, True, sequence_2_blast)

                        verboseprint("Re-calculating BSR at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))
                        allelescores, alleleList, listShortAllelesNames = reDogetBlastScoreRatios(
                            sequence_2_blast,
                            basepath,
                            alleleIaux,
                            allelescores,
                            Gene_Blast_DB_name2,
                            alleleList,
                            geneScorePickle,
                            verbose,
                            blastPath,
                            listShortAllelesNames)
                        verboseprint("Done Re-calculating BSR at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

        except Exception as e:
            print("some error occurred")
            print(e)
            print('Error on line {}'.format(sys.exc_info()[-1].tb_lineno))
            # Record ERROR only when no call was stored for this genome yet,
            # so a failure after an INF call does not add a second entry.
            if len(perfectMatchIdAllele2) == priorCalls2:
                perfectMatchIdAllele2.append("ERROR")
            if len(perfectMatchIdAllele) == priorCalls:
                perfectMatchIdAllele.append("ERROR")

    # add new alleles to the locus fasta file
    if len(newDNAAlleles2Add2Fasta) > 5:
        with open(geneFile, 'a') as fG:
            fG.write(newDNAAlleles2Add2Fasta)
    if len(newDNAAlleles2Add2shortFasta) > 5:
        with open(shortgeneFile, 'a') as fG:
            fG.write(newDNAAlleles2Add2shortFasta)

    final = (resultsList, perfectMatchIdAllele)
    verboseprint("Finished allele calling at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))
    filepath = os.path.join(temppath, os.path.basename(geneFile) + "_result.txt")
    filepath2 = os.path.join(temppath, os.path.basename(geneFile) + "_result2.txt")
    with open(filepath, 'wb') as f:
        pickle.dump(final, f)
    with open(filepath2, 'wb') as f:
        pickle.dump(perfectMatchIdAllele2, f)
    # the per-locus work directory is no longer needed
    shutil.rmtree(basepath)
    return True
def getBlastScoreRatios(genefile, basepath, doAll, verbose, blastPath):
    """Collect the self BLAST score of every allele of one locus.

    Every record of ``genefile`` is translated with ``translateSeq``. For
    each translatable allele a single-sequence BLAST database is created
    under ``basepath``. When ``doAll`` is true the protein is BLASTed
    against that database and the score of the first reported HSP is
    stored under the allele identifier; the resulting dictionary is
    pickled to ``<absolute genefile>_bsr.txt``. When ``doAll`` is false the
    previously pickled dictionary is loaded from that same file instead.

    All translated proteins are also written to
    ``<basepath>/<genefile basename>_protein.fasta``.

    Args:
        genefile: path to the locus FASTA file (DNA alleles).
        basepath: directory that receives the temporary protein files.
        doAll: true to run BLAST and (re)write the score cache, false to
            read the existing score cache.
        verbose: true to print progress messages.
        blastPath: blastp executable passed to ``NcbiblastpCommandline``.

    Returns:
        A tuple ``(var, alleleList, listAllelesNames)``: the
        ``{allele identifier (str): score}`` dictionary, the upper-cased
        DNA sequence of every allele, and the record id of every allele
        (both lists include alleles that could not be translated).
        ``var`` is an empty dictionary when no allele could be translated.
    """
    if verbose:
        def verboseprint(*args):
            for arg in args:
                print(arg)
    else:
        verboseprint = lambda *a: None  # do-nothing function

    geneScorePickle = os.path.abspath(genefile) + '_bsr.txt'
    singleProteinFasta = os.path.join(
        basepath, str(os.path.basename(genefile) + '_protein2.fasta'))

    # Initialised up front so the return below cannot hit an unbound name
    # when the file is empty or no allele is translatable.
    var = {}
    cacheLoaded = False
    blasted = False
    proteinRecords = []
    alleleList = []
    listAllelesNames = []

    # calculate bsr for each allele
    for allele in SeqIO.parse(genefile, "fasta"):

        # usually first allele name is just > 1 and after that it has
        # > gene_id_genome; the last "_"-separated field covers both cases
        alleleI = str(allele.id.split("_")[-1])

        alleleSeq = str(allele.seq.upper())
        alleleList.append(alleleSeq)
        listAllelesNames.append(allele.id)

        # try to translate the allele
        translatedSequence, _ = translateSeq(alleleSeq)

        if translatedSequence == '':
            print("cannot translate allele on bsr calculation")
            continue

        alleleProt = ">" + str(alleleI) + "\n" + str(translatedSequence + "\n")
        proteinRecords.append(alleleProt)

        # new db for each allele to blast it against himself
        # NOTE(review): the database is also built when doAll is false,
        # where it is not queried here; kept in case callers rely on it.
        Gene_Blast_DB_name = CommonFastaFunctions.Create_Blastdb_no_fasta(
            singleProteinFasta, 1, True, alleleProt)

        # if bsr hasn't been calculated, do the BLAST
        if doAll:
            blasted = True

            verboseprint("Starting Blast alleles at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

            # --- get BLAST score ratio --- #
            cline = NcbiblastpCommandline(cmd=blastPath, db=Gene_Blast_DB_name,
                                          evalue=0.001, outfmt=5, num_threads=1)
            out, err = cline(stdin=alleleProt)
            blast_records = NCBIXML.parse(StringIO(out))

            verboseprint("Blasted alleles at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

            hspScores = [int(match.score)
                         for blast_record in blast_records
                         for alignment in blast_record.alignments
                         for match in alignment.hsps]

            # Key the score by this allele directly. Zipping one flat score
            # list against the list of all allele ids shifted every later
            # score as soon as an allele was untranslatable, had no hit, or
            # produced more than one HSP.
            if hspScores:
                var[alleleI] = hspScores[0]

            verboseprint("________")

        # bsr had already been calculated, load it to memory (once)
        elif not cacheLoaded:
            # pickle.load must only ever see this locally written cache
            # file, never data from an untrusted source.
            with open(geneScorePickle, 'rb') as f:
                var = pickle.load(f)
            # needs to convert dictionaries that have integer keys
            # to string keys
            var = {str(k): v for k, v in var.items()}
            cacheLoaded = True

    # Write the score cache once, after all alleles were processed, instead
    # of rewriting the growing dictionary after every allele.
    if blasted:
        with open(geneScorePickle, 'wb') as f:
            pickle.dump(var, f)

    proteinfastaPath = os.path.join(basepath, str(os.path.basename(genefile) + '_protein.fasta'))
    with open(proteinfastaPath, "w") as f:
        f.write("".join(proteinRecords))

    # returning all allele BSR scores and list of alleles for this gene
    return var, alleleList, listAllelesNames

# Example #4 -- listing marker left over from extraction (original text: "示例#4", "0")
def main(genes, sizethresh, cpuToUse, proteinFIlePath, outputFIlePath,
         BlastpPath, bsr, verbose):
    """Select a non-redundant set of loci from a FASTA of genes.

    Steps, as implemented below:

    1. Unless ``proteinFIlePath`` is given, translate every record of
       ``genes`` with ``translateSeq``, drop untranslatable, repeated and
       short ones, drop proteins contained in an already kept (longer or
       equal) protein, and write the rest to ``proteins.fasta`` next to
       ``genes``.
    2. BLAST the protein file against itself with blastp.
    3. Walk the BLAST records and decide which loci to keep and which to
       remove, based on the score ratio against the self hit and ``bsr``.
    4. Write the kept loci: one ``<name>.fasta`` per locus (first allele
       named ``<name>_1``) into ``outputFIlePath`` or, without it, into a
       ``schema_seed`` folder next to the protein file; or, when
       ``proteinFIlePath`` and ``outputFIlePath`` are both given, one
       concatenated FASTA written to ``outputFIlePath``.
    5. Remove the ``blastdbs`` folder next to the protein file and the
       BLAST XML output.

    Args:
        genes: path to the input FASTA file with the gene sequences.
        sizethresh: length threshold (in bases) used to discard loci.
        cpuToUse: number of blastp threads; 1 is used when falsy.
        proteinFIlePath: optional existing protein FASTA to BLAST instead
            of translating ``genes``.
        outputFIlePath: output directory (no ``proteinFIlePath``) or
            output file (with ``proteinFIlePath``); may be falsy.
        BlastpPath: blastp executable passed to ``NcbiblastpCommandline``.
        bsr: score-ratio threshold above which two loci count as similar.
        verbose: true to print progress messages.
    """
    if verbose:

        def verboseprint(*args):
            for arg in args:
                print(arg)
    else:
        verboseprint = lambda *a: None

    # translate to protein and create new file
    # dirname() rather than str.replace(filename, ''): replace() would also
    # strip the file name from any directory component that contains it.
    abspath = os.path.dirname(os.path.abspath(genes))
    proteinfile = os.path.join(abspath, 'proteins.fasta')

    geneDict = {}
    protDict = {}
    orderedprotDict = collections.OrderedDict()
    alreadyIn = set()  # proteins seen so far; set gives O(1) membership
    totalgenes = 0
    repeatedgenes = 0
    smallgenes = 0
    nottranslatable = 0

    verboseprint("Checking translatability of the loci:\n")

    if not proteinFIlePath:
        with open(proteinfile, "w") as f:

            for gene in SeqIO.parse(genes, "fasta"):
                dnaseq = str(gene.seq.upper())
                protseq, seq, _ = translateSeq(dnaseq, gene.id)
                totalgenes += 1
                if len(protseq) > 1:

                    if str(protseq) in alreadyIn:
                        repeatedgenes += 1

                    elif len(str(seq)) < sizethresh:
                        smallgenes += 1

                    else:
                        alreadyIn.add(str(protseq))
                        protname = ">" + str(gene.id) + "\n"

                        f.write(protname + str(protseq) + "\n")
                        protDict[protname] = str(protseq)
                        geneDict[str(gene.name)] = dnaseq
                else:
                    nottranslatable += 1

            verboseprint(
                str(nottranslatable) + " not translatable out of " +
                str(totalgenes))

            verboseprint("\nChecking if repeated protein sequences:\n")

            # longest proteins first, so that a shorter protein is always
            # compared against the longer ones already kept
            for header, protein in sorted(protDict.items(),
                                          key=lambda x: len(x[1]),
                                          reverse=True):
                orderedprotDict[header] = protein

        verboseprint(
            str(repeatedgenes) + " repeated loci out of " + str(totalgenes))
        verboseprint(
            str(smallgenes) + " loci out of " + str(totalgenes) +
            " smaller than " + str(sizethresh) + "bp")
        verboseprint("\nprotein file created\n")

        # first step -  remove genes contained in other genes or 100% equal genes
        auxDict = {}  # protein sequence -> FASTA header line
        g = 0

        verboseprint(
            "Checking if protein sequences are contained in others...")

        # for each gene from all the annotated genes - starting with an
        # empty dictionary, only add a new gene if the "to be added gene" is
        # not contained or equal to a gene already added to the dictionary
        auxprot = []

        for header, protein in orderedprotDict.items():
            if any(protein in kept for kept in auxprot):
                g += 1
            else:
                auxDict[protein] = header
                auxprot.append(protein)

        verboseprint(str(g) + " loci are contained in other genes\n")

        # overwrite the original file, obtaining a new file with unique genes
        with open(proteinfile, "w") as f:
            f.write("".join(header + protein + "\n"
                            for protein, header in auxDict.items()))

    else:
        # a protein file was supplied: only the DNA lookup table is needed
        proteinfile = proteinFIlePath
        for gene in SeqIO.parse(genes, "fasta"):
            geneDict[str(gene.name)] = str(gene.seq.upper())

    verboseprint("Starting Blast")

    geneFile = os.path.abspath(proteinfile)
    Gene_Blast_DB_name = CommonFastaFunctions.Create_Blastdb(geneFile, 1, True)

    geneF = os.path.splitext(geneFile)[0]
    blast_out_file = geneF + '.xml'
    # ------------------------------ RUNNING BLAST ------------------------------ #
    cline = NcbiblastpCommandline(cmd=BlastpPath,
                                  query=geneFile,
                                  db=Gene_Blast_DB_name,
                                  evalue=0.001,
                                  out=blast_out_file,
                                  outfmt=5,
                                  num_threads=int(cpuToUse) if cpuToUse else 1)
    blast_records = CommonFastaFunctions.runBlastParser(cline, blast_out_file)
    verboseprint("Finished blast")

    toRemove = []
    genesToKeep = []
    # NOTE(review): the log is filled below but not written anywhere in
    # this function.
    log = ["removed\tcause\texplanation"]
    for blast_record in blast_records:

        query = str(blast_record.query)
        allelename = query.split(" ")[0]
        # outside the try on purpose of the original code: an id missing
        # from geneDict raises KeyError here and aborts the run
        alleleLength = len(geneDict[allelename])

        # "raise Exception" is used below as "stop processing this record";
        # the handler at the bottom also swallows genuine errors raised in
        # here (e.g. no alignments, hit id missing from geneDict, division
        # by a zero self score), which likewise just end the record.
        try:
            # if gene A is not on the toRemove list yet, add to genesToKeep list
            if query not in toRemove:
                genesToKeep.append(query)

                firstHit = str((blast_record.alignments[0]).hit_def)

                # if first alignement is not against self, gene B is bigger
                # than gene A and very simillar - remove gene A from
                # genesToKeep and add gene B instead
                if not query == firstHit:
                    genesToKeep.remove(query)
                    toRemove.append(query)
                    log.append(
                        query + "\t" + firstHit + "\t" +
                        "2 is first best match")

                    # if gene B is not on the toRemove list, add to genesToKeep list
                    if firstHit not in toRemove:
                        genesToKeep.append(firstHit)

                    raise Exception

                selfblastscore = (((blast_record.alignments[0]).hsps)[0]).score

                for align in blast_record.alignments:
                    hit = str(align.hit_def)

                    match = (align.hsps)[0]
                    scoreRatio = float(match.score) / float(selfblastscore)

                    alleleLength2 = len(geneDict[hit])

                    # if good match and gene B not in toremove list
                    if (scoreRatio > bsr and not hit == query
                            and hit not in toRemove):

                        # if gene B is bigger than gene A, keep bigger gene B
                        if alleleLength2 > alleleLength:
                            genesToKeep.append(hit)
                            genesToKeep.remove(query)
                            toRemove.append(query)
                            log.append(
                                query + "\t" + hit + "\t" +
                                "2 is bigger and bsr >" + str(bsr))

                            raise Exception
                        # else add gene B to toremove list
                        elif hit in genesToKeep:
                            genesToKeep.remove(hit)
                            toRemove.append(hit)
                            log.append(
                                hit + "\t" + query + "\t" +
                                "2 is bigger and bsr >" + str(bsr))

            # else gene A is on toRemove list, add all similar genes (not in
            # genesToKeep) list to the toRemove list
            else:
                # NOTE(review): as written this branch never records
                # anything. The first non-self alignment raises before the
                # while loop is reached; if every alignment is a self hit,
                # selfblastscore is still 0 and the division below raises
                # ZeroDivisionError, which the handler swallows. Logic kept
                # unchanged - confirm the intended behaviour before fixing.
                i = 0
                selfblastscore = 0
                for align in blast_record.alignments:
                    if not (str(align.hit_def) == query):
                        selfblastscore = ((align.hsps)[0]).score
                        raise Exception

                while i < len(blast_record.alignments):
                    align = blast_record.alignments[i]
                    match = (align.hsps)[0]
                    scoreRatio = float(match.score) / float(selfblastscore)

                    if align.hit_def not in genesToKeep and not str(
                            align.hit_def) == query and scoreRatio > bsr:
                        toRemove.append(align.hit_def)
                        log.append(
                            str(align.hit_def) + "\t" + query + "\t" +
                            "2 was on the removed list and bsr >" + str(bsr))

                    i += 1

        except Exception:
            # deliberate best-effort: see the comment above the try
            pass

    genesToKeep = list(set(genesToKeep))
    toRemove = list(set(toRemove))
    # sets for the per-contig membership tests below
    keepSet = set(genesToKeep)
    removeSet = set(toRemove)

    pathfiles = os.path.dirname(geneFile)
    pathfiles = pathfiles + "/"
    listfiles = []

    removedparalogs = 0
    removedsize = 0
    totalgenes = 0
    rest = 0
    concatenatedParts = []
    schema_folder_path = os.path.join(pathfiles, 'schema_seed')

    # Per-locus files go to outputFIlePath when given, else to schema_seed.
    # The existence check now covers both folders; previously an existing
    # outputFIlePath made os.makedirs raise.
    if not proteinFIlePath:
        schemaDir = outputFIlePath if outputFIlePath else schema_folder_path
        if not os.path.exists(schemaDir):
            os.makedirs(schemaDir)

    for contig in SeqIO.parse(genes, "fasta"):
        totalgenes += 1
        name2 = contig.id

        if name2 not in removeSet and name2 in keepSet:
            # NOTE(review): strict ">" here versus "<" in the translation
            # step, so a sequence of exactly sizethresh bases passes the
            # first filter but is dropped here - confirm which is intended.
            if int(len(contig.seq)) > sizethresh:
                # make the record name safe to use as a file name
                namefile = contig.name
                namefile = namefile.replace("|", "_")
                namefile = namefile.replace("_", "-")
                namefile = namefile.replace("(", "")
                namefile = namefile.replace(")", "")
                namefile = namefile.replace("'", "")
                namefile = namefile.replace("\"", "")
                namefile = namefile.replace(":", "")

                if not proteinFIlePath:
                    newFile = os.path.join(schemaDir, namefile + ".fasta")
                    listfiles.append(newFile)
                    with open(newFile, "w") as f:
                        f.write(">" + namefile + "_1\n" +
                                str(contig.seq).upper() + "\n")
                else:
                    concatenatedParts.append(
                        ">" + contig.id + " \n" + str(contig.seq.upper()) +
                        "\n")

                rest += 1

            else:
                removedsize += 1
        else:

            removedparalogs += 1

    if proteinFIlePath and outputFIlePath:
        with open(outputFIlePath, "w") as f:
            f.write("".join(concatenatedParts))

    # create short folder
    else:
        # NOTE(review): this also runs when proteinFIlePath is given without
        # outputFIlePath, in which case the supplied protein file is the one
        # removed below - confirm that is intended.
        init_schema_4_bbaca.get_Short(listfiles)
        verboseprint("\nRemoved " + str(removedparalogs) +
                     " with a high similarity (BSR>" + str(bsr) + ")")
        print("\nCreated schema with " + str(rest) + " loci.")
        os.remove(proteinfile)

    shutil.rmtree(os.path.join(pathfiles, 'blastdbs'))

    os.remove(blast_out_file)