Exemplo n.º 1
0
def main():
    """
    ARGS:
        None. The directory to search is read from sys.argv[1].
    RETURN:
        None
    DESCRIPTION:
        Walks the directory tree rooted at the path given on the command line
        and prints the full path of every file found. Only "Darwin" is
        accepted as a platform; on any other platform exit_with_error() is
        called for the first file encountered.
    DEBUG:
    FUTURE:
    """
    if (sys.version_info[0] < 3):
        # Report the offending interpreter version. The original formatted
        # sys.argv[0] (the script name) into this message by mistake.
        exit_with_error("ERROR!!! Runs with python3, NOT {}\n".format(
            "python{}".format(sys.version_info[0])))
    if (len(sys.argv) == 2 and ("--h" in sys.argv[1] or "-h" in sys.argv[1])):
        print_help(ExitCode=0)
    elif (len(sys.argv) != 2):
        print_help(ExitCode=1)
    path = sys.argv[1]  # Path to search

    # The platform cannot change during the run, so query it only once.
    system = platform.system()

    # Master loop
    for root, _dirL, fileL in os.walk(path):
        for f in fileL:
            if (system == "Darwin"):
                print(os.path.join(root, f))
            else:
                exit_with_error(
                    "ERROR!!! {} is an unsupported platform".format(system))
Exemplo n.º 2
0
def displacement(Agent1=None, Agent2=None):
    """
    ARGS:
        Agent1 : AGENT, or a list / tuple whose first two items are x and y
        Agent2 : AGENT, or a list / tuple whose first two items are x and y
    RETURN:
        Euclidean distance between the two positions in the x-y plane.
    DESCRIPTION:
        Gives displacement between two agents. Anything that is neither an
        AGENT nor a list / tuple is reported through exit_with_error().
    DEBUG:
    FUTURE:
    """

    def _xy(obj):
        """Return (x, y) for an AGENT or for a plain coordinate sequence."""
        # isinstance() instead of type(...) == ... so subclasses are accepted.
        if isinstance(obj, (list, tuple)):
            return obj[0], obj[1]
        if isinstance(obj, AGENT):
            return obj.posL[0], obj.posL[1]
        exit_with_error("ERROR!!! {} not a supported type\n".format(
            type(obj)))

    x1, y1 = _xy(Agent1)
    x2, y2 = _xy(Agent2)

    return np.sqrt((x1 - x2)**2 + (y1 - y2)**2)
Exemplo n.º 3
0
    def __init__(self, Chromosome = None, Sequence = None):
        """
        ARGS:
            Chromosome = chromosome string
            Sequence   = chromosome sequence

        RETURN:
            NONE : Initializes CHROMOSOME

        DESCRIPTION:
            Stores the chromosome name and its sequence on the instance.
            Both arguments are mandatory; a missing one is reported through
            exit_with_error().

        DEBUG:
            For a shortened whole genome fasta file, it correctly reads it in.
            See read_genome() for debugging code and further details.

        FUTURE:
        """
        # Placeholders, overwritten below once the arguments are validated
        self.chrm = None        # Chromosome
        self.seq = []           # sequence

        if Chromosome is None:
            exit_with_error("ERROR! Chromosome is not specified!\n")
        else:
            self.chrm = Chromosome

        if Sequence is None:
            exit_with_error("ERROR! Sequence is not specified!\n")
        else:
            self.seq = Sequence
Exemplo n.º 4
0
def get_trans_seq(transList, exonList):
    """
    ARGS:
        transList : list of transcripts; each needs .transID and .seq
        exonList  : list of exons; each needs .transID, .exonNum and .seq

    RETURN:
        None. Each transcript's .seq is extended in place.

    DESCRIPTION:
        Gets sequences for transcripts by concatenating, in exonList order,
        the sequences of the exons that carry the same transID. The exon
        numbers of one transcript must appear as 1, 2, 3, ...; otherwise
        exit_with_error() is called.

    DEBUG:
        Tested on 2 transcripts, more testing required. Getting a transcript file
        with the transcripts and sequences is challenging though

    FUTURE:
    """
    timeBegin = datetime.datetime.now()

    # Group the exons by transcript ID in a single pass (exonList order is
    # preserved inside each group). This avoids rescanning the whole exon
    # list once per transcript.
    exonsByTrans = {}
    for exon in exonList:
        exonsByTrans.setdefault(exon.transID, []).append(exon)

    for trans in transList:
        prevExonNum = 0  # used to check that exons are loaded in order
        for exon in exonsByTrans.get(trans.transID, ()):
            exonNum = int(exon.exonNum)
            if (exonNum - prevExonNum != 1):
                exit_with_error(
                    "ERROR! exon numbers for %s are loaded out of "
                    "order!\n" % (trans.transID))
            if (trans.seq is None):
                trans.seq = exon.seq
            else:
                trans.seq += exon.seq
            prevExonNum = exonNum
    timeEnd = datetime.datetime.now()
    print("get_trans_seq() run time = %s" % (timeEnd - timeBegin))
def main():
    """
    ARGS:
        None. Six command line arguments are consumed:
            sys.argv[1] : path handed to read_gtf()
            sys.argv[2] : path handed to read_genome()
            sys.argv[3] : path handed to read_config()
            sys.argv[4] : number of reads (int); overrides the value that
                          read_config() returns
            sys.argv[5] : fastq path handed to create_fastq_file()
            sys.argv[6] : read type; only 'single' is implemented

    RETURN:
        None. Ends with sys.exit(0).

    DESCRIPTION:
        Loads the gtf and genome, builds the exon / transcript / gene lists,
        links them together, prints statistics and finally calls
        create_fastq_file() for the transcripts named in the config file.

    DEBUG:

    FUTURE:
    """
    tStart = time.time()

    # Anything other than exactly six arguments shows the help text; the
    # exit code handed to print_help() says whether help was asked for.
    if len(sys.argv) != 7:
        askedForHelp = (len(sys.argv) > 1
                        and sys.argv[1] in ("--help", "-h"))
        if askedForHelp:
            print_help(0)
        else:
            print_help(1)

    random.seed(42)
    pathToGtf = sys.argv[1]
    pathToSeq = sys.argv[2]
    pathToConfig = sys.argv[3]
    pathToFastq = sys.argv[5]
    readType = sys.argv[6]

    # Build the annotation hierarchy and attach sequences
    gtfList = read_gtf(pathToGtf)
    exonList = get_exon_list(gtfList)
    transList = get_transcript_list(gtfList, exonList)
    geneList = get_gene_list(gtfList, transList)
    chrmList = read_genome(pathToSeq)
    uniqueFeatureList = get_list_of_unique_gtf_features(gtfList)
    get_exon_seq(exonList, chrmList)
    link_exons_trans_and_genes(gtfList, exonList, transList, geneList)

    geneDict, transDict = create_gene_and_trans_lookup_dict(
        geneList, transList)
    print_gtf_statistics(exonList, transList, geneList)

    # The read count from the config file is superseded by the command line.
    readLength, desiredTransList, abundanceList, _ = read_config(pathToConfig)
    numOfReads = int(sys.argv[4])

    pairedTypes = ('paired-fr-first', 'paired-fr-second')
    if readType not in ('single',) + pairedTypes:
        exit_with_error("ERROR!!! Incorrect value for {}".format(readType))
    else:
        ### Paired end reads are not working yet ###
        if readType in pairedTypes:
            exit_with_error(
                "ERROR!!! paired-fr-first and paired-fr-second \n"
                "not yet implemented. \n\n"
                "NOTE:: Both reads are tentatively found in the \n"
                "       INSERT class. The second read is not used.\n"
                "       The second read should definitely needs checked.\n")
        create_fastq_file(pathToFastq, desiredTransList, abundanceList,
                          numOfReads, readLength, transDict, transList,
                          exonList, readType)

    print("Unique features in Gtf : ")
    for feature in uniqueFeatureList:
        print("\t%s" % (feature))
    print("Run time : %s" % (time.time() - tStart))
    sys.exit(0)
Exemplo n.º 6
0
def read_gtf(PathToGtf=None):
    """
    ARGS:
        PathToGtf = path to gene transfer file

    RETURN:
        A list of GTF_ENTRYs

    DESCRIPTION:
        Reads in gtf file. Lines starting with '#' are skipped; every other
        line must hold exactly 9 tab separated columns, which are passed
        verbatim (as strings) to GTF_ENTRY.

    DEBUG:
        Can reproduce input gtf file. Works as expected.

    FUTURE:
    """
    if (PathToGtf[-3:] != "gtf"):
        exit_with_error(
            "ERROR! You did not pass a file with the .gtf extention\n")

    gtfList = []
    timeBegin = datetime.datetime.now()

    # 'with' guarantees the file is closed even when a malformed line
    # aborts the parse part-way through.
    with open(PathToGtf, 'r') as gtfFile:
        for line in gtfFile:
            if (line[0] == "#"):
                continue
            fields = line.split("\t")
            # Format check
            if (len(fields) != 9):
                exit_with_error(
                    "ERROR! There should be 9 tab separated columns in a"
                    " GTF file\nYou only have %i\n" % (len(fields)))
            gtfList.append(
                GTF_ENTRY(Chromosome=fields[0],
                          Source=fields[1],
                          EntryType=fields[2],
                          Start=fields[3],
                          Stop=fields[4],
                          Score=fields[5],
                          Strand=fields[6],
                          Frame=fields[7],
                          Attribute=fields[8]))

    timeEnd = datetime.datetime.now()
    print("read_gtf()      run time = %s" % (timeEnd - timeBegin))
    return gtfList
Exemplo n.º 7
0
def parse_argv(argv):
    """
    Parse the command line into the equation string and two option flags.

    Accepted forms (argv[0] is the program name):
        prog EQUATION       -> (EQUATION, 0, 0)
        prog -p EQUATION    -> (EQUATION, 1, 0)
        prog -P EQUATION    -> (EQUATION, 0, 1)
        prog -h             -> prints usage and exits with status 2

    Missing arguments are reported with exit_with_error(-1), surplus
    arguments with exit_with_error(-2).

    Returns the tuple (equation, flag_1, flag_2).
    """
    flag_1 = 0
    flag_2 = 0
    equation = ""
    argc = len(argv)
    # Use the argv that was passed in throughout. The original measured
    # argv but indexed the global sys.argv, so any caller passing a
    # different list got inconsistent results.
    # Error out when no arguments were given
    if argc == 1:
        exit_with_error(-1)
    elif argv[1] == "-h":
        printUsage()
        sys.exit(2)
    elif argv[1] == "-p" or argv[1] == "-P":
        if argc == 2:
            exit_with_error(-1)
        elif argc > 3:
            exit_with_error(-2)

        # Bonus options
        if argv[1] == "-p":
            flag_1 = 1
        else:
            flag_2 = 1
        equation = argv[2]
    elif argc == 2:
        equation = argv[1]
    else:
        exit_with_error(-2)

    return equation, flag_1, flag_2
Exemplo n.º 8
0
def check(eq):
    """
    Validate the raw equation string.

    Each failed check calls exit_with_error() with a numeric code:
        -2 : eq contains a character other than digits, 'X', '.', space,
             '-', '+', '*', '^' or '='
        -3 : more than one '='
        -4 : no '=' at all
        -5 : fewer than 3 characters once spaces are removed, or the last
             character is not a digit

    Returns None.
    """
    # Raw string: "\-" inside an ordinary literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    if re.sub(r'[^0-9X. \-+*^=]', '', eq) != eq:
        exit_with_error(-2)

    eq = eq.replace(' ', '')
    nEquals = eq.count('=')
    if nEquals > 1:
        exit_with_error(-3)
    elif nEquals == 0:
        exit_with_error(-4)

    if len(eq) < 3:
        exit_with_error(-5)

    if not eq[-1].isdigit():
        exit_with_error(-5)
Exemplo n.º 9
0
def get_exon_seq(exonList, chrmList):
    """
    ARGS:
        exonList : list of exons; each needs .chrm, .start, .stop, .strand
                   and receives .seq
        chrmList : list of chromosomes; each needs .chrm and .seq

    RETURN:
        None

    DESCRIPTION:
        Gets sequences for exons from chromosome list. For '+' strand exons
        the slice seq[start-1:stop] is stored; for '-' strand exons the
        reverse complement of that slice is stored and exon.start / exon.stop
        are swapped. Any other strand value, or coordinates that fall outside
        the chromosome, are reported through exit_with_error().

    DEBUG:
        Spot checked 3 exons, are all ok. More testing needed, however it is challenging
        to get a list of all the exons (incl. seqs) in a single file

    FUTURE:
    """
    timeBegin = datetime.datetime.now()

    # Index the chromosomes by name once instead of scanning chrmList for
    # every exon. Lists keep the original behaviour should a name repeat.
    chrmByName = {}
    for chrm in chrmList:
        chrmByName.setdefault(chrm.chrm, []).append(chrm)

    for exon in exonList:
        for chrm in chrmByName.get(exon.chrm, ()):
            chrmLen = len(chrm.seq)
            start = exon.start - 1  # -1 b/c python is 0 indexed, gtf file is not
            end = exon.stop
            # 'end' is an exclusive slice bound, so end == chrmLen is a valid
            # exon that finishes on the last base of the chromosome.
            if (start >= chrmLen or end > chrmLen):
                exit_with_error(
                    "ERROR!! start (%i) or stop (%i) Position > "
                    "chromosome length (%i)\n" % (start, end, chrmLen))
            if (exon.strand == '+'):
                exon.seq = chrm.seq[start:end]
            elif (exon.strand == '-'):
                exon.seq = reverse_complement(chrm.seq[start:end])
                exon.start, exon.stop = exon.stop, exon.start
            else:
                # Format the message here; the original passed the strand as
                # a second positional argument, leaving '%s' unfilled.
                exit_with_error("ERROR! strand char = %s is invalid" %
                                (exon.strand))
    timeEnd = datetime.datetime.now()
    print("get_exon_seq()  run time = %s" % (timeEnd - timeBegin))
Exemplo n.º 10
0
def reverse_complement(seq):
    """
    ARGS:
        seq : sequence with _only_ A, T, C, G or N (case sensitive)

    RETURN:
        rcSeq : reverse complement of sequenced passed to it.

    DESCRIPTION:
        Complements every base (A<->T, G<->C, N stays N) and reverses the
        result. Any other character is reported through exit_with_error().

    DEBUG:
        Compared several sequences.  Is working.

    FUTURE:
    """
    complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}
    compL = []      # complemented bases, still in forward order

    # Collect into a list and join once; repeated string '+=' is quadratic
    # on long sequences.
    for char in seq:
        if(char in complement):
            compL.append(complement[char])
        else:
            exit_with_error("ERROR! char %s is not a valid sequencing character!\n"%(char))

    # Reverse
    return "".join(reversed(compL))
Exemplo n.º 11
0
def main():
    """
    ARGS:
        None. No command line arguments are accepted apart from a help flag.

    RETURN:
        None
    DESCRIPTION:
        Times repeated numpy matrix multiplications: two random matrices of
        shape [5000 x 10000] and [10000 x 5000] are multiplied 50 times with
        np.dot() and the total wall-clock time is printed.
    NOTES:
    DEBUG:
    FUTURE:
    """
    ### Interpreter and command line checks ###
    pyMajor = sys.version_info[0]
    if pyMajor != 3:
        exit_with_error(
            "ERROR!!! Runs with python3, NOT python-{}\n\n".format(pyMajor))
    argc = len(sys.argv)
    if argc == 2 and sys.argv[1].startswith(("--h", "-h")):
        print_help(0)
    elif argc != 1:
        print_help(1)

    ### Benchmark parameters ###
    outerDim = 5000     # outer dimension of the product
    innerDim = 10000    # shared (contracted) dimension
    nTrials = 50        # number of multiplications that are timed

    # Fixed seeds so every run multiplies the same matrices
    random.seed(42)
    np.random.seed(42)
    A = np.random.rand(outerDim, innerDim)
    B = np.random.rand(innerDim, outerDim)
    print("Matrix Size = [{} {}] x [{} {}] = multiplied {} times ".format(
        outerDim, innerDim, innerDim, outerDim, nTrials))

    tStart = time.time()
    for _ in range(nTrials):
        product = np.dot(A, B)
    elapsed = time.time() - tStart
    print("Run time : {:.4f} s".format(elapsed))
Exemplo n.º 12
0
def main():
    """
    ARGS:
        None. One optional command line argument, "quarantine", switches on
        quarantining of infected agents. Any other single argument shows the
        help text.
    RETURN:
        None. Ends with sys.exit(0).
        1. Creates images (tmp/NNNN.png per step and tmp/SIR_vs_time.png).
           Turn into moving using ffmpeg, e.g.
           ffmpeg -framerate 4 -pattern_type glob -i 'tmp/*.png' -c:v libx264 out.mp4
    DESCRIPTION:
        Agent based Susceptible-Infected-Removed simulation. Every step each
        agent is binned for plotting, optionally quarantined, moved with
        move_agent(), possibly marked as removed, and, if infectious, given
        the chance to infect every susceptible agent within infectDist.
    DEBUG:
    FUTURE:
        1. Add option to fit only a specific section of data.
        2. Make main loop NOT O(N^2). Maybe organize by position on a grid.
    """
    # Check Python version
    nArg = len(sys.argv)
    # Use python 3
    if (sys.version_info[0] != 3):
        exit_with_error("ERROR!!! Use Python 3\n")
    # Get options. 'quarantine' gets a default so it is always bound; the
    # original left it undefined when a single unrecognized argument was
    # given, which crashed further down with a NameError.
    quarantine = False
    if (nArg > 1 and "-h" in sys.argv[1]):
        print_help(0)
    elif (nArg != 1 and nArg != 2):
        print_help(1)
    elif (nArg == 2):
        if (sys.argv[1] == "quarantine"):
            quarantine = True
        else:
            # Unrecognized option
            print_help(1)

    startTime = time.time()
    print("{} \n".format(sys.argv), flush=True)
    print("   Start Time : {}".format(
        time.strftime("%a, %d %b %Y %H:%M:%S ", time.localtime())),
          flush=True)
    ### Parameters to Change
    N = 200  # Number of Agents
    nDays = 100  # number of days in simulation
    dt = 0.25  # length of one step in days, total steps = nDays / dt
    nStep = int(nDays / dt)
    infectTime = 14 / dt  # Infection time in units of steps
    asymptomaticTime = 5 / dt  # Asymptomatic time in units of steps
    prob = 0.125  # Probability of infecting agent within infectDist
    infectDist = 0.05  # Distance person must be within to get infected
    critMass = 20  # Number of people before instituting a quarantine
    nDayAsymptAndInfec = 2  # Number days asymptomatic AND infectious
    agentL = []
    nSuscL = []  # Number of susceptible per step
    nInfL = []  # Number of infected per step
    nRmL = []  # Number of removed per step
    startQuarantine = False
    print("Parameters : \n"
          "     N = {}\n"
          "     prob = {}\n"
          "     nDays = {}\n"
          "     nStep= {}\n"
          "     infectDist= {}\n"
          "     critMass= {}\n"
          "     nDayAsymptAndInfec = {}\n"
          "     quarantine= {}\n".format(N, prob, nDays, nStep, infectDist,
                                         critMass, nDayAsymptAndInfec,
                                         quarantine))

    # Initialize agents
    for n in range(N):
        agent = AGENT(n)
        agentL.append(agent)

    # Infect 1 agent
    agentL[0].infected = True
    agentL[0].start = 0

    # Simulation - O(N**2)
    for step in range(nStep):
        # Positions collected for this step's plot
        sxL = []  # Susceptible xL
        syL = []
        ixL = []  # Infected xL
        iyL = []
        rxL = []  # Removed xL
        ryL = []
        ### Only if quarantining infected individuals
        if (quarantine == True):
            qxL = []  # quarantine
            qyL = []

        for i, agent in enumerate(agentL):
            # Susceptible
            if (agent.infected == False and agent.immune == False):
                sxL.append(agent.posL[0])
                syL.append(agent.posL[1])
            # Infected
            if (agent.infected == True):
                ixL.append(agent.posL[0])
                iyL.append(agent.posL[1])
            # Removed
            if (agent.immune == True):
                rxL.append(agent.posL[0])
                ryL.append(agent.posL[1])

            ### Only if quarantining infected individuals
            if (quarantine == True):
                if (len(ixL) == critMass):  # Critical mass to quarantine
                    startQuarantine = True
                # Quarantine after nDayAsymptAndInfec days of infectiousness
                if ((agent.infected == True and step - agent.start -
                     asymptomaticTime >= nDayAsymptAndInfec / dt)
                        and startQuarantine == True):
                    agent.quarantine = True
                    agent.vL[0] = 0
                    agent.vL[1] = 0
                if (agent.quarantine == True):
                    qxL.append(agent.posL[0])
                    qyL.append(agent.posL[1])

            # Move
            move_agent(agent, agentL, infectDist, quarantine, dt)

            # 'Removed' Group. If survived, adjust time.
            if (step - agent.start > infectTime and agent.infected == True):
                agent.infected = False
                agent.immune = True
                if (quarantine == True):
                    agent.quarantine = False

            # Only infected, no longer asymptomatic, non-quarantined agents
            # can infect others; everybody else is done for this step.
            if (agent.infected == False
                    or (step - agent.start < asymptomaticTime)
                    or agent.quarantine == True):
                continue

            # Infectious Group - Try to infect someone
            for j, other in enumerate(agentL):
                # Skip self and anyone who cannot be infected
                if (i == j or other.immune == True
                        or other.infected == True
                        or other.quarantine == True):
                    continue
                d = displacement(agent, other)
                if (d <= infectDist):
                    rng = random.uniform(0, 1)
                    if (rng < prob):
                        other.infected = True
                        other.start = step
                        agent.nInfect += 1

        # Plot
        fig, ax = plt.subplots()
        ax.scatter(sxL, syL, c="black", marker=".", label="Susceptible")
        ax.scatter(ixL, iyL, c="red", marker="^", label="Infected")
        # Add infection radius
        for x, y in zip(ixL, iyL):
            circle = Circle((x, y), radius=infectDist)
            circle.set_edgecolor("red")
            circle.set_facecolor("none")
            ax.add_artist(circle)
        ax.scatter(rxL, ryL, c="blue", marker="s", label="Removed")
        ax.grid(True)
        ax.legend(loc=1)
        ax.set_xlim((0, 1))
        ax.set_ylim((0, 1))
        # State R0: mean number of infections caused by the removed agents
        RL = np.asarray([a.nInfect for a in agentL if (a.immune == True)])
        if (np.sum(RL) > 0):
            R = np.mean(RL)
        else:
            R = 0
        ax.set_title(
            "Critical Fraction : {:<.1f}%\n{:<.2f} days, R = {:<.2f}".format(
                critMass / N * 100, step * dt, R))
        plt.savefig("tmp/{:04d}.png".format(step))
        plt.close('all')

        # Record number susceptible, infected and removed
        nSuscL.append(len(sxL))
        nInfL.append(len(ixL))
        nRmL.append(len(rxL))

    # Generate plot of SIR vs. Time
    fig, ax = plt.subplots()
    ax.plot(range(nStep), nSuscL, c="black", label="Susceptible")
    ax.plot(range(nStep), nInfL, c="red", label="Infected")
    ax.plot(range(nStep), nRmL, c="blue", label="Removed")
    ax.legend(loc=1)
    # One tick every 10 days; tick positions are in steps, labels in days
    ax.xaxis.set_ticks([d * 1.0 / dt for d in range(nDays) if (d % 10 == 0)])
    ax.set_xticklabels([d for d in range(nDays) if (d % 10 == 0)])
    ax.set_title(
        "Critical Fraction : {:<.1f}%\nSusceptible-Infected-Removed vs. Time".
        format(critMass / N * 100))
    ax.set_xlabel("Time (days)")
    ax.set_ylabel("Number of Agents")
    plt.savefig("tmp/SIR_vs_time.png")
    plt.close('all')

    print("Ended : %s" % (time.strftime("%D:%H:%M:%S")))
    print("Run Time : {:.4f} h".format((time.time() - startTime) / 3600.0))
    sys.exit(0)
Exemplo n.º 13
0
def link_exons_trans_and_genes(gtfList, exonList, transList, geneList):
    """
    ARGS:
        gtfList  : list of all GTF_ENTRYs
        exonList : list of all EXONS
        transList: list of all TRANSCRIPTS
        geneList : list of all GENES

    RETURN:
        None. geneList and transList entries are updated in place.

    DESCRIPTION:
        Loops through gtfList and captures the indices of exons in exonList and passes
        it to the transcripts in transList.  Also captures indices of transcripts in
        transList and passes it to genes in geneList.

        exonList, transList and geneList must be in the same order as their
        entries appear in gtfList; any mismatch is reported through
        exit_with_error(). Afterwards every transcript's .seq is rebuilt
        from the sequences of its exons.

    DEBUG:
        1. I validated by using print_transcripts_with_seqs() and comparing against the
           biomart download for chromosome 1. My data file was _identical_ to biomart's.

           For how this was done, see the debug comment in print_transcripts_with_seqs()
        2. Checked Transcript.seq for reverse strand ('-') transcript. Used
           ENST00000488147 it is correct.

    FUTURE:
    """
    gIdx = 0  # Gene index, for geneList
    tIdx = 0  # Transcript index, for transList
    eIdx = 0  # Exon index, for exonList
    gtfListLen = len(gtfList)
    timeBegin = datetime.datetime.now()

    # Index based loops because the entries following the current one must
    # be inspected as well.
    for i in range(gtfListLen):
        if (gtfList[i].etype != "gene"):
            continue

        # Check that genes in geneList are same order as gtfList
        if (gtfList[i].geneID != geneList[gIdx].geneID):
            # The original referenced an undefined name (gtfEntry) here.
            exit_with_error(
                "ERROR! gtfList[%i].geneID = %s and geneList[%i].geneID = %s"
                % (i, gtfList[i].geneID, gIdx, geneList[gIdx].geneID))
        j = i + 1

        # Get constituent transcripts between gene entries. The bound is
        # tested BEFORE indexing so a gene that is the very last gtf entry
        # does not run off the end of the list.
        while (j < gtfListLen and gtfList[j].etype != "gene"):
            if (gtfList[j].etype == "transcript"):

                # Check that transcripts in transList are same order as gtfList
                # Checking transcripts after gene in gtf _actually_ are members of the gene
                # Add trans info to appropriate geneList[]
                if (gtfList[j].transID == transList[tIdx].transID
                        and gtfList[i].geneID == transList[tIdx].geneID
                        and gtfList[i].geneID == geneList[gIdx].geneID):
                    geneList[gIdx].transList.append(transList[tIdx].transID)
                    geneList[gIdx].transIdxList.append(tIdx)
                    k = j + 1

                    # Get constituent exons between transcript entries
                    # (same bound-before-index guard as above)
                    while (k < gtfListLen
                           and gtfList[k].etype != "transcript"):
                        if (gtfList[k].etype == "exon"):
                            # Check exons in exonList are same order as gtfList
                            # Checking exons after trans in gtf are members trans
                            # Add exon info to appropriate transList[]
                            if (gtfList[k].transID == exonList[eIdx].transID
                                    and gtfList[i].geneID
                                    == exonList[eIdx].geneID
                                    and gtfList[i].geneID
                                    == geneList[gIdx].geneID):
                                transList[tIdx].exonList.append(
                                    exonList[eIdx].exonID)
                                transList[tIdx].exonIdxList.append(eIdx)
                                eIdx += 1
                            else:
                                exit_with_error(
                                    "ERROR! gtfList[%i].transID = %s and exonList[%i]."
                                    "transID = %s\n\tgtfList[%i].geneID = %s and "
                                    "transList[%i].geneID = "
                                    "%s\n\tand geneList[%i].geneID = %s\n"
                                    % (k, gtfList[k].transID, eIdx,
                                       exonList[eIdx].transID, k,
                                       gtfList[k].geneID, tIdx,
                                       transList[tIdx].geneID, gIdx,
                                       geneList[gIdx].geneID))
                        k += 1
                    tIdx += 1
                else:
                    exit_with_error(
                        "ERROR! gtfList[%i].transID= %s and transList[%i].transID = "
                        "%s\n\tgtfList[%i].geneID = %s and transList[%i].geneID = "
                        "%s\n\tand geneList[%i].geneID = %s\n" %
                        (j, gtfList[j].transID, tIdx,
                         transList[tIdx].transID, j, gtfList[j].geneID,
                         tIdx, transList[tIdx].geneID, gIdx,
                         geneList[gIdx].geneID))
            j += 1
        gIdx += 1

    # Now get transcript sequences.
    for trans in transList:
        trans.seq = "".join(exonList[idx].seq for idx in trans.exonIdxList)

    timeEnd = datetime.datetime.now()
    print("link_exons_trans_and_genes() run time = %s" % (timeEnd - timeBegin))
Exemplo n.º 14
0
def main():
    """
    ARGS:
        None. No command line arguments are accepted apart from a help flag.
    RETURN:
        None. Ends with sys.exit(0).
    DESCRIPTION:
        Reads the JHU global deaths time series, sums the rows of each
        country of interest, shifts every series so it starts at the first
        non-zero value and plots ln(deaths) per country together with
        reference lines for doubling times of 1, 3 and 10 days.
    DEBUG:
        1. Checked adding countries' data, hopefully don't double count
           if france gets more  points in the regions.  CHecked summing
           of China's regions (since there isn't a global value)

           E.g.
           grep -i china data/jhu/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv  | awk 'BEGIN{FS=","; SUM=0}{SUM+=$66}END{print SUM}'
           3274
    FUTURE:
        1. Add option to fit only a specific section of data.
    """
    # Interpreter and command line checks
    argc = len(sys.argv)
    if sys.version_info[0] != 3:
        exit_with_error("ERROR!!! Use Python 3\n")
    if argc > 1 and "-h" in sys.argv[1]:
        print_help(0)
    elif argc != 1:
        print_help(1)

    tStart = time.time()
    print("{} \n".format(sys.argv), flush=True)
    print("   Start Time : {}".format(
        time.strftime("%a, %d %b %Y %H:%M:%S ", time.localtime())),
          flush=True)

    # Load the data set
    csvPath = "data/jhu/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"
    countries = ["us", "spain", "italy", "china", "korea, south", "germany",
                 "france", "canada", "united kingdom"]
    frame = pd.read_csv(csvPath)
    finalDate = frame.columns[-1]

    # country -> np array of deaths, one entry per date column
    deathsByCountry = {}
    for country in countries:
        wanted = country.lower()
        for _, row in frame.iterrows():
            # Column 1 is compared against the country name
            if row.values[1].lower() != wanted:
                continue
            counts = np.asarray(row.values[4:], dtype=np.float32)
            # Convert nan's to 0's, maybe wrong?
            counts[np.isnan(counts)] = 0

            # First row seen for this country starts its series
            if country not in deathsByCountry:
                deathsByCountry[country] = counts
                expectedLen = len(deathsByCountry[country])
                continue

            # Further rows are added on top b/c some countries, i.e. china,
            # don't have a single country val
            deathsByCountry[country] += counts
            # Debugging
            if country == 'us' and np.isnan(deathsByCountry[country][-1]):
                print("{} {}".format(country, len(deathsByCountry[country])))
            if expectedLen != len(deathsByCountry[country]):
                exit_with_error("ERROR!!! {} != {}\n".format(
                    expectedLen, len(deathsByCountry[country])))

    # not every country got deaths at the same time. Let's shift the time
    # points to be starting at the first death(s)
    for country in deathsByCountry:
        series = deathsByCountry[country]
        firstIdx = next((d for d, v in enumerate(series) if v > 0), None)
        if firstIdx is not None:
            deathsByCountry[country] = series[firstIdx:]

    usTotals = deathsByCountry['us']
    usDaily = [later - earlier
               for earlier, later in zip(usTotals[:-1], usTotals[1:])]
    print("\nDaily deaths (last 10 days) in US\n\t{}\n".format(usDaily[-10:]))
    print("Total deaths (last 10 days) in US\n\t{}\n".format(usTotals[-10:]))

    fig, ax = plt.subplots(1, 1)
    ax.set_title("Covid-19 Deaths per country (ending {})".format(finalDate))

    # One curve per country, cycling through the available line styles
    styles = ["-", "--", "-.", ":", "solid"]
    for idx, country in enumerate(deathsByCountry):
        days = range(len(deathsByCountry[country]))
        logDeaths = np.log(deathsByCountry[country])
        ax.plot(days, logDeaths, label="{}".format(country),
                ls=styles[idx % len(styles)])
        # Shorter annotation for the UK only; the legend keeps the full name
        tag = "UK" if country == "united kingdom" else country
        ax.annotate(tag, xy=(days[-1], logDeaths[-1]), ha="center",
                    va="center", rotation=45)

    def draw_doubling(nPoints, growth, legend, tag):
        """Plot and annotate ln(growth(x)) for x in range(nPoints)."""
        span = range(nPoints)
        curve = np.log(np.asarray([growth(x) for x in span]))
        ax.plot(span, curve, label=legend, ls="solid", color="black")
        ax.annotate(tag, xy=(span[-1], curve[-1]), ha="center",
                    va="center", rotation=45)

    # Reference lines for different doubling times
    draw_doubling(15, lambda x: 1 * 2**x, "2bl time=1 day", "1 day")
    draw_doubling(35, lambda x: 1 * 2**(x / 3.0), "2bl time= 3 day", "3 day")
    draw_doubling(45, lambda x: 1 * 2**(x / 10.0), "2bl time= 10 day",
                  "10 day")

    # Generate Plot
    ax.set_xlabel("Time spanning days since first death")
    ax.set_ylabel("{}".format("ln(deaths)"))
    ax.legend()
    plt.show()

    print("Ended : %s" % (time.strftime("%D:%H:%M:%S")))
    print("Run Time : {:.4f} h".format((time.time() - tStart) / 3600.0))

    sys.exit(0)
Exemplo n.º 15
0
def move_agent(Agent=None,
               AgentL=None,
               InfectDist=None,
               Quarantine=None,
               DeltaT=None):
    """
    ARGS:
        Agent      : AGENT, The AGENT whose trajectory we are computing
        AgentL     : List of AGENTs, Used for avoiding quarantined AGENTs
        InfectDist : Float, Infectiousness distance. Used as radius around
                     quarantined AGENTs
        Quarantine : Boolean, do we quarantine infected agents?
        DeltaT     : Time interval.
    RETURN:
        None. Agent.posL and Agent.vL are modified in place.
    DESCRIPTION:
        Moves agent one time step and keeps it inside the unit square
        [0,0] -> [1,1]. After the move, each velocity component receives a
        small random kick drawn from uniform(-1, 1) / 100.

        An agent that is both quarantined and immune is left untouched.

        If the quarantine option is used, a non-infected agent is deflected
        around quarantined agents. This is an approximation of reality.

        Reasons :
            1. Trying to avoid multiple quarantined individuals gets logically
               complicated to code.
            2. Avoiding the bounds and trying to avoid a quarantined
               individual at the same time adds yet another level of logical
               complexity.

        Solution :
            1. Only the first quarantined individual encountered in AgentL is
               avoided, and then the bounds are tested.
    DEBUG:
    FUTURE:
        1. Handle trajectories with xf == xi (vertical line, undefined slope).
        2. Detect trajectories that cross the exclusion circle while both end
           points are outside of it.
    """
    xi = Agent.posL[0]
    yi = Agent.posL[1]
    vx = Agent.vL[0]
    vy = Agent.vL[1]
    v = np.sqrt(vx**2 + vy**2)  # Speed, reused when the agent is deflected
    xf = vx * DeltaT + xi
    yf = vy * DeltaT + yi
    r = InfectDist  # Radius about quarantined individual

    # Agents that are both quarantined and immune do not move.
    if (Agent.quarantine == True and Agent.immune == True):
        return

    if (Quarantine == True and Agent.infected == False):
        # Trajectory as a line,     y = mx + b
        # NOTE(review): xf == xi (no motion along x) divides by zero here;
        # the vertical-trajectory case is not handled.
        m = (yf - yi) / (xf - xi)  # Slope of line
        b = yf - m * xf  # Pick a point on the line, solve for intercept

        for agent in AgentL:
            # Must be quarantined to avoid
            if (agent.quarantine == False):
                continue
            xc = agent.posL[0]
            yc = agent.posL[1]
            # displ, quarantined - Agent final
            dfq = np.sqrt((xc - xf)**2 + (yc - yf)**2)
            # displ, quarantined - Agent initial
            diq = np.sqrt((xc - xi)**2 + (yc - yi)**2)
            # There might be a collision -
            #   It is possible that both dfq and diq are outside of the
            #   radius, yet the trajectory passes through it. That case is
            #   not detected by the test below (gross and imprecise).
            if (dfq <= r or diq <= r):
                # Intersection of trajectory with the circle of exclusion
                #   0 = (x - xc)^2 + (y - yc)^2 - r^2
                #   xc,yc = x,y position of center of circle
                def f(x):
                    y = m * x + b
                    return ((x - xc)**2 + (y - yc)**2 - r**2)

                # One starting guess on either side of the circle center; f is
                # evaluated elementwise so each guess is refined on its own.
                xroots = optimize.fsolve(f, [xc - r, xc + r])
                # If there are two roots, pick the one closest to Agent
                if (len(xroots) == 2):
                    x1 = xroots[0]
                    y1 = m * x1 + b
                    d1 = displacement(Agent, [x1, y1])
                    x2 = xroots[1]
                    y2 = m * x2 + b
                    d2 = displacement(Agent, [x2, y2])
                    if (d1 < d2):
                        x = x1
                        y = y1
                    else:
                        x = x2
                        y = y2

                elif (len(xroots) == 1):
                    # Fixed: previously referenced the undefined name 'xroot'
                    x = xroots[0]
                    y = m * x + b
                else:
                    exit_with_error(
                        "ERROR!!! I don't understand how there can "
                        "be more than 2 roots!\n")
                # Vector from circle center to the intersection point
                rx = x - xc
                ry = y - yc
                rvect = [rx, ry]
                # Find line perpendicular to rvect, i.e. tangent to the circle,
                # call it 't'. Let :
                #       t     = ta \hat(i) + tb \hat(j)
                #       rvect = rx \hat(i) + ry \hat(j)
                # Solve equation :
                #       t \dot rvect = 0
                #       ta * rx + tb * ry = 0
                #       ta = -(tb * ry) / rx
                # (ta, tb are kept distinct from the line intercept 'b', which
                #  the closure f() above still refers to.)
                ### rx == 0 special case
                if (rx == 0):
                    ta = 0
                    tb = 1
                    alpha = np.pi / 2  # 90deg, Angle between tangent and horizontal
                ### ry == 0 special case
                elif (ry == 0):
                    ta = 1
                    tb = 0
                    alpha = 0  # 0deg, Angle between tangent and horizontal
                ### Everything else
                else:
                    tb = 1
                    ta = -tb * ry / rx
                    alpha = np.arctan(tb / ta)  # Angle between tangent and horizontal
                    if (np.isnan(alpha)):
                        # Fixed: two placeholders need two arguments
                        exit_with_error(
                            "ERROR!!! np.arctan({}/{}) == nan\n".format(tb, ta))

                # Now get angle between rvector and velocity vector
                theta = np.arccos((vx * rvect[0] + vy * rvect[1]) / np.sqrt(
                    (vx**2 + vy**2) * (rvect[0]**2 + rvect[1]**2)))
                # Angle of reflection w/r/t to the tangent line on circle
                phi = theta - np.pi / 2.0
                vx = v * np.sin(phi) * np.cos(alpha)
                # NOTE(review): parenthesis placement differs from the vx
                # expression (cos(phi * sin(alpha)) vs cos(phi) * sin(alpha));
                # left unchanged - confirm the intended formula.
                vy = v * np.cos(phi * np.sin(alpha))
                xf = vx * DeltaT + xi
                yf = vy * DeltaT + yi
                Agent.vL[0] = vx
                Agent.vL[1] = vy
                # Only the first quarantined agent encountered is avoided
                break

    # Check bounds
    # Lower bounds : reflect position back inside and flip the velocity
    if (xf < 0):
        xf = -1.0 * xf
        Agent.vL[0] = -1.0 * Agent.vL[0]
    if (yf < 0):
        yf = -1.0 * yf
        Agent.vL[1] = -1.0 * Agent.vL[1]
    # Upper bounds : position ends up exactly on the boundary (xf - d == 1.0)
    # NOTE(review): unlike the lower bound this clamps rather than reflects
    # (a reflection would be 1.0 - d); left unchanged - confirm intent.
    if (xf > 1.0):
        d = xf - 1.0
        xf = xf - d
        Agent.vL[0] = -1.0 * Agent.vL[0]
    if (yf > 1.0):
        d = yf - 1.0
        yf = yf - d
        Agent.vL[1] = -1.0 * Agent.vL[1]
    # Adjust Position
    Agent.posL[0] = xf
    Agent.posL[1] = yf
    # Adjust velocity with a small random kick
    dvx = random.uniform(-1,
                         1) / 100.0  # Want crossing time to be about 25 steps
    dvy = random.uniform(-1, 1) / 100.0
    Agent.vL[0] += dvx
    Agent.vL[1] += dvy
Exemplo n.º 16
0
def solve(fac):
    """
    Solve a polynomial equation of degree <= 2 and print the result.

    Reads the coefficients fac.a, fac.b, fac.c (a*X^2 + b*X + c), the degree
    fac.max_degree and the flags fac.if_i (print the explanatory comment) and
    fac.if_p (plot real solutions via show_plot). Results are stored on Sol
    (comment, n, x) and printed. Nothing is returned.
    """
    # NOTE(review): this binds the Sol class itself, not an instance, so the
    # attributes written below live on the class - confirm this is intended.
    sl = Sol
    disc = fac.b * fac.b - 4 * fac.a * fac.c

    if fac.max_degree > 2:
        exit_with_error(-6)
    elif fac.a == 0 and fac.b == 0 and fac.c != 0:
        sl.comment = '\nThis equation has no solutions :C \n'
    elif fac.a == 0 and fac.b == 0 and fac.c == 0:
        sl.comment = '\nThe solution to this equation is any value of X *o* \n'
    elif fac.a == 0 and fac.b != 0:
        # Linear equation : b*X + c = 0
        sl.comment = '\nThe graph of your equation is a straight line, so there is only one solution \n'
        sl.n = 1
        sl.x.append(round(-fac.c / fac.b, 3))
    else:
        # Quadratic equation, the sign of the discriminant picks the case
        opens_up = fac.a > 0
        if disc > 0:
            sl.comment = (
                '\nYour equation graph is a parabola with branches up. '
                'It intersects the OX axis at two points, so there are two solutions \n'
                if opens_up else
                '\nYour equation graph is a parabola with branches down. '
                'It intersects the OX axis at two points, so there are two solutions \n'
            )
            sl.n = 2
            sqrt_disc = math.sqrt(disc)
            sl.x.append(round((-fac.b + sqrt_disc) / (2 * fac.a), 3))
            sl.x.append(round((-fac.b - sqrt_disc) / (2 * fac.a), 3))
        elif disc == 0:
            sl.comment = (
                '\nYour equation graph is a parabola with branches up. '
                'It touches the OX axis at the vertex, so there is only one solution \n'
                if opens_up else
                '\nYour equation graph is a parabola with branches down. '
                'It touches the OX axis at the vertex, so there is only one solution \n'
            )
            sl.n = 1
            sl.x.append(round(-fac.b / (2 * fac.a), 3))
        else:
            sl.comment = (
                '\nYour equation graph is a parabola with branches up. However, '
                'it does not cross the OX axis! Wow! This means you have two complex roots *0* \n'
                if opens_up else
                '\nYour equation graph is a parabola with branches down. However, '
                'it does not cross the OX axis! Wow! This means you have two complex roots *0* \n'
            )
            sl.n = 2
            # Complex roots are stored as display strings "re +/- im * i"
            real_txt = str(round(-fac.b / (2 * fac.a), 3))
            imag_txt = str(round(math.sqrt(-disc) / (2 * fac.a), 3))
            sl.x.append(real_txt + ' + ' + imag_txt + ' * i')
            sl.x.append(real_txt + ' - ' + imag_txt + ' * i')

    # Explanation is shown when there is nothing to solve or when requested
    if sl.n == 0 or fac.if_i == 1:
        print(sl.comment)

    if sl.n > 0:
        print('The solution is:')

        if sl.n == 1:
            print('X = ', sl.x[0])
        elif sl.n == 2:
            print('X1 =', sl.x[0])
            print('X2 =', sl.x[1], '\n')

    # Only real solutions are plotted
    if fac.if_p and sl.n >= 1 and disc >= 0:
        show_plot(sl.x, fac.a, fac.b, fac.c)
Exemplo n.º 17
0
    def __init__(self, Insert = None, ReadLength = None, MetaData = None,
                 ExonList = None, Direction = None):
        """
        ARGS:
            Insert     = an INSERT instance
            ReadLength = length of desired read.
            MetaData   = Read number, used as the prefix of self.metadata
            ExonList   = not used by this method
            Direction  = either 'forward' or 'reverse'
                         'forward' : read is the first ReadLength bases of
                                     Insert.seq
                         'reverse' : read is the reverse complement of the
                                     last ReadLength bases of Insert.seq

                         E.g.
                         5' =>=>=>=>=>=>=>=>=>=>=>=>=>=>=>=>=>=>=>=>=>=> 3'
                            -----> 'forward'
                                                     'reverse' <--------
        RETURN:
            NONE : Initializes the read (seq, start, stop, metadata)

        DESCRIPTION:
            Currently, synthetic reads are forced to be completely contained
            within the transcript, i.e. they are not permitted to read into
            the adapters; that is saved for future work.

            The metadata string has the form
                <MetaData>:trans:<transID>:start:<start>:exons
            followed by ':<exonID>:<start>:<stop>' for every exon spanned.

        NOTES :
            1. http://onetipperday.sterding.com/2012/07/how-to-tell-which-library-type-to-use.html

        DEBUG:
            1. See debugging comments from INSERT class

        FUTURE:
            1. Test both types of paired end reads
            2. Test reads/transcripts on the +/- strands
        """
        self.seq = None
        self.qual = None
        self.readlen = None
        self.metadata = None
        self.readDirection = Direction  # 'forward' or 'reverse' (complement) read
        self.start = 0                  # coord wrt chromosome start (gtf start def this way)
        self.stop = 0                   # coord wrt chromosome stop (gtf stop def this way)

        # Validate inputs
        if(not isinstance(Insert, INSERT)):
            exit_with_error("ERROR! Insert is not of class type INSERT\n")

        if(ReadLength is None):
            exit_with_error("ERROR! ReadLength not specified!\n")
        else:
            self.readlen = ReadLength

        # Defined elsewhere in the class; presumably fills in self.qual
        self.get_qual()

        # Recall that Transcript.seq is always in the direction that
        # transcripts are transcribed. start / stop are _inclusive_.
        direction = self.readDirection
        if(direction == "forward"):
            self.seq = Insert.seq[0:ReadLength]
            self.start = Insert.r1Start
            self.stop = Insert.r1Stop
            exonsSpannedL = Insert.r1ExonL
        elif(direction == "reverse"):
            insertLen = len(Insert.seq)
            tail = Insert.seq[insertLen-ReadLength:insertLen]
            self.seq = reverse_complement(tail)
            self.start = Insert.r2Start
            self.stop = Insert.r2Stop
            exonsSpannedL = Insert.r2ExonL
        else:
            exit_with_error("ERROR!!! {} is invalid".format(self.readDirection))

        # Add useful information to reads
        if(MetaData is None):
            exit_with_error("ERROR! MetaData not specified!\n")

        header = "%s:trans:%s:start:%i:exons"%(MetaData, Insert.transcript.transID,
                                                self.start)
        # One ':id:start:stop' triple per exon, in the order they are spanned
        exonFields = ["{}:{}:{}".format(exon.exonID, exon.start, exon.stop)
                      for exon in exonsSpannedL]
        self.metadata = ":".join([header] + exonFields)
Exemplo n.º 18
0
    def __init__(self, Chromosome = None, Source = None, EntryType= None, Start = None,
                 Stop = None, Score = None, Strand = None, Frame = None,
                 Attribute = None):
        """
        ARGS:
            Chromosome  : Chromosome, can only be 1,2,...,X,Y
            Source      : Database or Project name for entry
            EntryType   : exon, transcript, CDS, gene, etc
            Start       : start position on chromosome (converted with int())
            Stop        : stop position on chromosome (converted with int())
            Score       : must be the empty-field marker '.'
            Strand      : '+' (forward) or '-' (reverse)
            Frame       : 0 indexed position of first base of codon
            Attribute   : semicolon sep list of tag-value pairs

        RETURN:
            NONE : Initializes GTF_ENTRY

        DESCRIPTION:
            Stores the nine GTF columns and parses the attribute column into
            geneID, geneName, transID, transName, exonID, exonNum and biotype.
            Every argument is required; a missing or malformed one is reported
            through exit_with_error(). Attribute tags that are absent leave
            the matching member as None.

        DEBUG:
            Tested by reading in gtf file and printing out list of GTF_ENTRYs.
            compared using full Homo_sapiens.GRCh38.83.gtf and the output
            was _identical_.

        FUTURE:
        """
        # GTF columns
        self.chrm = None          # str, Chromosome
        self.src = None           # str, Source of data
        self.etype = None         # str, exon, transcript, CDS, gene
        self.start = None         # int, Start position on chrm
        self.stop = None          # int, End position on chrm
        self.score = None         # str, expects empty field, ie : '.'
        self.strand = None        # str, '+' (forward) or '-' (reverse)
        self.frame = None         # str, 0 indexed position of first base of codon
        self.attribute = None     # str, semicolon sep list of tag-value pairs
        # Members parsed out of the attribute column
        self.geneID = None        # str, Gene ID ... starts with ENSG..
        self.geneName = None      # str, common gene name
        self.transID = None       # str, transcript ID ... starts with ENST...
        self.transName = None     # str, transcript Name.. = gene common name + num
        self.exonID = None        # str, exon ID ... starts with ENSE
        self.exonNum = None       # int, exon number
        self.biotype = None       # str, gene_biotype

        if(Chromosome is None):
            exit_with_error("ERROR! Chromosome not specified!\n")
        else:
            self.chrm = Chromosome

        if(Source is None):
            exit_with_error("ERROR! Source not specified!\n")
        else:
            self.src = Source

        if(EntryType is None):
            exit_with_error("ERROR! EntryType not specified!\n")
        else:
            self.etype = EntryType

        if(Start is None):
            exit_with_error("ERROR! Start not specified!\n")
        else:
            self.start = int(Start)

        if(Stop is None):
            exit_with_error("ERROR! Stop not specified!\n")
        else:
            self.stop = int(Stop)

        if(Score is None):
            exit_with_error("ERROR! Score not specified!\n")
        else:
            self.score = Score
            # Only the empty-field marker is accepted
            if(self.score != '.'):
                exit_with_error("ERROR! Score = %s. Expected empty field\n"%(self.score))

        if(Strand is None):
            exit_with_error("ERROR! Strand not specified!\n")
        else:
            self.strand = Strand
            if(self.strand not in ('+', '-')):
                exit_with_error("ERROR! Strand = %s, wrong format\n"%(self.strand))

        if(Frame is None):
            exit_with_error("ERROR! Frame not specified!\n")
        else:
            self.frame = Frame

        if(Attribute is None):
            exit_with_error("ERROR! Attribute not specified!\n")
        else:
            # Keep only the text before the first newline
            self.attribute = Attribute.split("\n")[0]

            # Build tag -> value lookup. The last piece after the final ';'
            # is dropped; within a pair only the first two space separated
            # tokens are used.
            attributeDict = {}
            for rawPair in self.attribute.split(";")[:-1]:
                tokens = rawPair.strip().split(" ")
                attributeDict[tokens[0]] = tokens[1]

            # Tags missing from the attribute column stay None
            self.geneID = attributeDict.get("gene_id")
            self.geneName = attributeDict.get("gene_name")
            self.transID = attributeDict.get("transcript_id")
            self.transName = attributeDict.get("transcript_name")
            self.exonID = attributeDict.get("exon_id")
            if("exon_number" in attributeDict):
                # First and last character are dropped before int()
                # (presumably the surrounding quotes - verify against input)
                self.exonNum = int(attributeDict["exon_number"][1:-1])
            self.biotype = attributeDict.get("gene_biotype")
Exemplo n.º 19
0
    def __init__(self, GtfEntry):
        """
        ARGS:
            GtfEntry = a single GTF_ENTRY class element

        RETURN:
            NONE : Initializes GENE

        DESCRIPTION:
            Copies chrm, start, stop, strand, geneID and geneName from
            GtfEntry. Each of them must be set; a value of None is reported
            through exit_with_error(). The transcript lists start out empty.

        DEBUG:

        FUTURE:
        """
        self.chrm = None          # str, Chromosome
        self.start = None         # int, Start position on chrm
        self.stop = None          # int, End position on chrm
        self.strand = None        # str, '+' (forward) or '-' (reverse)
        self.geneID = None        # str, nominal geneID, but may belong to multiple genes
        self.geneName = None      # str, common gene name
        self.transList = []       # transcript names that are part of this gene
        self.transIdxList = []    # list, use to quickly map to AllTransList

        # type check
        if(not isinstance(GtfEntry, GTF_ENTRY)):
            exit_with_error("ERROR! GtfEntry is not of class type GTF_ENTRY\n")

        # Required fields : complain first, copy otherwise
        if(GtfEntry.chrm is None):
            exit_with_error("ERROR! GtfEntry.chrm is None\n")
        else:
            self.chrm = GtfEntry.chrm

        if(GtfEntry.start is None):
            exit_with_error("ERROR! GtfEntry.start is None\n")
        else:
            self.start = int(GtfEntry.start)

        if(GtfEntry.stop is None):
            exit_with_error("ERROR! GtfEntry.stop is None\n")
        else:
            self.stop = int(GtfEntry.stop)

        if(GtfEntry.strand is None):
            exit_with_error("ERROR! GtfEntry.strand is None\n")
        else:
            self.strand = GtfEntry.strand

        if(GtfEntry.geneID is None):
            exit_with_error("ERROR! GtfEntry.geneID is None\n")
        else:
            self.geneID = GtfEntry.geneID

        if(GtfEntry.geneName is None):
            exit_with_error("ERROR! GtfEntry.geneName is None\n")
        else:
            self.geneName = GtfEntry.geneName
Exemplo n.º 20
0
    def __init__(self, GtfEntry):
        """
        ARGS:
            GtfEntry = a single GTF_ENTRY class element

        RETURN:
            NONE : Initializes TRANSCRIPT

        DESCRIPTION:
            Copies chrm, start, stop, strand, geneID and transID from
            GtfEntry. Each of them must be set; a value of None is reported
            through exit_with_error(). transNum is transID with its first
            four characters removed. The sequence and exon lists start out
            empty and the copy number starts at 1.

        FUTURE:
        """
        self.seq = None           # str, sequence
        self.chrm = None          # str, Chromosome
        self.start = None         # int, Start position on chrm
        self.stop = None          # int, End position on chrm
        self.strand = None        # str, '+' (forward) or '-' (reverse)
        self.geneID = None        # str, nominal geneID, but may belong to multiple genes
        self.transID = None       # str, nominal transcript ID, may belong to mult. trans.
        self.transNum = None      # str, transID[4:], used for sorting by transcript num
        self.exonList = []        # exon names that are part of this transcript
        self.exonIdxList = []     # list, use to quickly map to AllExonList

        # copy number of this particular transcript in the transcriptome,
        # set to 1 at the moment.
        self.copy = 1

        # type check
        if(not isinstance(GtfEntry, GTF_ENTRY)):
            exit_with_error("ERROR! GtfEntry is not of class type GTF_ENTRY\n")

        # Required fields : complain first, copy otherwise
        if(GtfEntry.chrm is None):
            exit_with_error("ERROR! GtfEntry.chrm is None\n")
        else:
            self.chrm = GtfEntry.chrm

        if(GtfEntry.start is None):
            exit_with_error("ERROR! GtfEntry.start is None\n")
        else:
            self.start = int(GtfEntry.start)

        if(GtfEntry.stop is None):
            exit_with_error("ERROR! GtfEntry.stop is None\n")
        else:
            self.stop = int(GtfEntry.stop)

        if(GtfEntry.strand is None):
            exit_with_error("ERROR! GtfEntry.strand is None\n")
        else:
            self.strand = GtfEntry.strand

        if(GtfEntry.geneID is None):
            exit_with_error("ERROR! GtfEntry.geneID is None\n")
        else:
            self.geneID = GtfEntry.geneID

        if(GtfEntry.transID is None):
            exit_with_error("ERROR! GtfEntry.transcriptID is None\n")
        else:
            self.transID = GtfEntry.transID
            self.transNum = self.transID[4:]
Exemplo n.º 21
0
def main():
    """
    ARGS:

    RETURN:
    DESCRIPTION:
    NOTES:
    DEBUG:
    FUTURE:
    """
    ### Check Python version and CL args ###
    if(sys.version_info[0] != 3):
        exit_with_error("ERROR!!! Runs with python3, NOT python-{}\n\n".format(
                        sys.version_info[0]))
    nArg = len(sys.argv)
    if(nArg == 2 and (sys.argv[1][0:3] == "--h" or sys.argv[1][0:2] == "-h")):
        print_help(0)
    elif(nArg != 4):
        print_help(1)
    startTime = time.time()
    print("Start Time : {}".format(time.strftime("%a, %d %b %Y %H:%M:%S ",
                                   time.localtime())))
    print("Logging run output to driver.log\n\n")
    ### Variables ###
    options    = sys.argv[1]
    workPath   = sys.argv[2]           # Path where all the output/work will be saved.
    refPath    = sys.argv[3]           # Path where all the ref data and indices are located
    ompNumThreadsL = [1,2,5,20]        # Cores used in OMP tasks
    matrixSizeL = [5000]     # outer dim of mats to run matrix_multiply on
    #matrixSizeL = [2000,3000,5000]     # outer dim of mats to run matrix_multiply on
    #rnaSeqSizeL = [10**4,10**5]
    rnaSeqSizeL = [10**5]
    nTrials     = 3                     # number of trials to test,get stdev and mean
    shortNTrials= 1                # shortened num of trials to test,get stdev and mean
    # Create work path dir if doesn't exist
    if(not os.path.isdir(workPath)):
        os.mkdir(workPath)

    ## In Linux singularity container add cores per socket and total cores
    ## to ompNumThreadsL
    if(shutil.which('lscpu') != None):
        # Record raw lscpu, lscpu -e and numactl --hardware
        lscpuLog=open("{}/lscpu.log".format(workPath), "a")
        cmd="lscpu"
        lscpuLog.write("\n{}:\n{}\n".format(cmd,subprocess.getoutput(cmd)))
        cmd="lscpu -e"
        lscpuLog.write("\n{}:\n{}\n".format(cmd,subprocess.getoutput(cmd)))
        cmd="numactl --hardware"
        lscpuLog.write("\n{}:\n{}\n".format(cmd,subprocess.getoutput(cmd)))
        lscpuLog.close()
    
        # other details
        cmd="lscpu | grep 'Core(s) per socket:' | awk '{print $4}'"
        coresPerSocket = int(subprocess.getoutput(cmd))
        cmd="lscpu  | grep '^CPU(s):' | awk '{print $2}'"
        totalCores = int(subprocess.getoutput(cmd))
        cmd="lscpu | grep 'NUMA node0 CPU' | awk '{print $4}'"
        ## Numa - node
        coresPerNuma = subprocess.getoutput(cmd)
        if('-' in coresPerNuma):
            coresPerNuma = coresPerNuma.split('-')
            coresPerNuma[0] = int(coresPerNuma[0])
            coresPerNuma[1] = int(coresPerNuma[1])
            coresPerNuma = coresPerNuma[1] - coresPerNuma[0] + 1
        elif(',' in coresPerNuma):  # Interleave off
            coresPerNuma = len(coresPerNuma.split(','))
        else:
            exit_with_error("ERROR!!! Format for coresPerNuma is not handled"
                            ": {}".format(coresPerNuma))
        ## Insert
        bisect.insort_left(ompNumThreadsL, coresPerNuma)
        bisect.insort_left(ompNumThreadsL, coresPerSocket)
        bisect.insort_left(ompNumThreadsL, totalCores)
        ompNumThreadsL=list(sorted(set(ompNumThreadsL)))
        print("Cores per NUMA : {}".format(coresPerNuma))
        print("Cores per socket : {}".format(coresPerSocket))
        print("Total Cores : {}".format(totalCores))
        print("Cores tested : {}".format(ompNumThreadsL))

    # Get operating system and list of cores (linux only) to take advantage of NUMA
    curOS = sys.platform
    if(curOS == 'darwin'):
        curOS = 'osx'         # Rename for my own selfish readability
    elif(curOS == 'linux'):
        cmd = "grep -P 'processor[\t ]' /proc/cpuinfo | cut -d: -f2 | tr -d ' '"
        coreIDL = subprocess.getoutput(cmd)
        coreIDL = [int(idx) for idx in coreIDL.split()]
        ompCoresIdD = dict() # List of list cores to use associated with ompNumThreadsL
        for nThread in ompNumThreadsL:
            ompCoresIdD[nThread] = get_core_ids(NumThreads = nThread)

    else:
        exit_with_error("ERROR!! {} is an unsupported operating system".format(curOS))

    if(options != 'all' and options != 'build_mat_mult_data' and 
       options != 'mat_mult_non_cache_opt' and options != 'local_memory_access' and
       options != 'mat_mult_cache_opt' and options != 'build_rnaseq_data' and
       options != 'align_rnaseq_tophat' and options != 'align_rnaseq_hisat' and
       options != 'cufflinks_assemble'  and options != 'cuffmerge' and
       options != 'cuffcompare' and options != 'cuffquant' and
       options != 'cuffnorm' and options != 'cuffdiff' and options != 'kelvin'
    ):
        exit_with_error("ERROR!!! {} is invalid option\n".format(options))


    ######## Run Tests ########
    if(options == 'all' or options == 'build_mat_mult_data'):
        nThread = 1
        print("Building data for matrix_multiply (time to run is for numpy's matrix mult.: ")
        print("--------------------------------------------------------")
        print(" {:<10} | {:<12} | {:<15} | {:<15}".format("Size", "OMP_Threads", "mean",
              "stdev"))
        print("--------------------------------------------------------")
        ### Create directory structure in data
        outDirPrefix = "{}/data/matrix".format(workPath)
        if(not os.path.isdir(outDirPrefix)):
            os.mkdir(outDirPrefix)

        for size in matrixSizeL:
            outDir = "{}/{}".format(outDirPrefix,size)
            if(not os.path.isdir(outDir)):
                os.mkdir(outDir)
            runTimeV = np.zeros([shortNTrials])
            for tIdx in range(shortNTrials):   ### change to shortNTrials
                if(curOS == 'linux'):
                    taskset = "taskset -c {} ".format(ompCoresIdD[nThread])
                else:
                    taskset = ""
                cmd =  ("{} python3 src/matrix/matrix_generator.py {} 10000 "
                             "10000 {} {}".format(taskset, size, size, outDir))
                output = "{}\n".format(cmd)
                output = output + subprocess.getoutput(cmd)
                runTime = parse_run_time(output,workPath) # Run time
                runTimeV[tIdx]= runTime
            print(" {:<10} | {:<12} | {:<15.4f} | {:<15.4f}".format(size, nThread,
                      np.mean(runTimeV), np.std(runTimeV)))
            print("--------------------------------------------------------")


    if(options == 'all' or options == 'mat_mult_cache_opt'):
        print("matrix_multiply (cache optimized using OpenMP) : ")
        print("--------------------------------------------------------")
        print(" {:<10} | {:<12} | {:<15} | {:<15}".format("Size", "OMP_Threads", "mean",
              "stdev"))
        print("--------------------------------------------------------")

        ### Create directory structure in output
        outDirPrefix = "{}/output/matrix_cache_opt".format(workPath)
        if(not os.path.isdir(outDirPrefix)):
            os.mkdir(outDirPrefix)

        for size in matrixSizeL:
            outDir = "{}/{}".format(outDirPrefix,size)
            if(not os.path.isdir(outDir)):
                os.mkdir(outDir)

            for nThread in ompNumThreadsL:
                runTimeV = np.zeros([nTrials])
                #nThread = 10
                #size=2000
                for tIdx in range(nTrials):
                    if(curOS == 'linux'):
                        taskset = "taskset -c {} ".format(ompCoresIdD[nThread])
                    else:
                        taskset = ""
                    cmd =  ("export OMP_NUM_THREADS={}; {} "
                            "./src/matrix/matrix_multiply_cache_opt "
                            "{}/data/matrix/{}/A.txt {}/data/matrix/{}/B.txt  "
                            "{}".format(nThread,taskset,workPath,size,workPath,size,
                            outDir))
                    output = "{}\n".format(cmd)
                    output = output + subprocess.getoutput(cmd)
                    runTime = parse_run_time(output,workPath) # Run time
                    runTimeV[tIdx]= runTime
                print(" {:<10} | {:<12} | {:<15.4f} | {:<15.4f}".format(size, nThread,
                      np.mean(runTimeV), np.std(runTimeV)))
            print("--------------------------------------------------------")


    if(options == 'all' or options == 'mat_mult_non_cache_opt'):
        print("matrix_multiply (non-cache optimized using OpenMP) : ")
        print("--------------------------------------------------------")
        print(" {:<10} | {:<12} | {:<15} | {:<15}".format("Size", "OMP_Threads", "mean",
              "stdev"))
        print("--------------------------------------------------------")

        ### Create directory structure in output
        outDirPrefix = "{}/output/matrix_non_cache_opt".format(workPath)
        if(not os.path.isdir(outDirPrefix)):
            os.mkdir(outDirPrefix)

        for size in matrixSizeL:
            outDir = "{}/{}".format(outDirPrefix,size)
            if(not os.path.isdir(outDir)):
                os.mkdir(outDir)

            for nThread in ompNumThreadsL:
                runTimeV = np.zeros([nTrials])
                #nThread = 10
                #size=2000
                for tIdx in range(nTrials):
                    if(curOS == 'linux'):
                        taskset = "taskset -c {} ".format(ompCoresIdD[nThread])
                    else:
                        taskset = ""
                    cmd =  ("export OMP_NUM_THREADS={}; {} "
                            "./src/matrix/matrix_multiply_non_cache_opt "
                            "{}/data/matrix/{}/A.txt {}/data/matrix/{}/B.txt  "
                            "{}".format(nThread,taskset,workPath,size,workPath,
                            size,outDir))
                    output = "{}\n".format(cmd)
                    output = output + subprocess.getoutput(cmd)
                    runTime = parse_run_time(output,workPath) # Run time
                    runTimeV[tIdx]= runTime
                print(" {:<10} | {:<12} | {:<15.4f} | {:<15.4f}".format(size, nThread,
                      np.mean(runTimeV), np.std(runTimeV)))
            print("--------------------------------------------------------")


    if(options == 'all' or options == 'build_rnaseq_data'):
        print("Building RNA-Seq Data sets : ")
        print("--------------------------------------------------------")
        print(" {:<10} | {:<12} | {:<15} | {:<15}".format("Size", "OMP_Threads", "mean",
              "stdev"))
        print("--------------------------------------------------------")
        nThread = 1
        nSamp   = 3
        treatSampL = []
        wtSampL = []
        gtf="{}/chr1_short.gtf".format(refPath)
        genome ="{}/chr1_short.fa".format(refPath)
        configL=["config/config_wt_chr1.txt", "config/config_treat_chr1.txt"]

        # Create output directory structure
        outDir  = "{}/data/rnaseq".format(workPath)
        if(not os.path.isdir(outDir)):
            os.mkdir(outDir)
        outDir  = "{}/fastq/".format(outDir)
        if(not os.path.isdir(outDir)):
            os.mkdir(outDir)
        ## Loop
        for size in rnaSeqSizeL:
            runTimeV = np.zeros([nSamp*len(configL)])
            tIdx = 0
            for config in configL:
                for samp in range(nSamp):
                    ## Set output files
                    if("treat" in config):
                        if(not os.path.isdir("{}/{}".format(outDir,size))):
                            os.mkdir("{}/{}".format(outDir,size))
                        outFile = "{}/{}/treat_{}".format(outDir,size,samp)
                        treatSampL.append(outFile)
                    elif("wt" in config):
                        if(not os.path.isdir("{}/{}".format(outDir,size))):
                            os.mkdir("{}/{}".format(outDir,size))
                        outFile = "{}/{}/wt_{}".format(outDir,size,samp)
                        wtSampL.append(outFile)
                    else:
                        exit_with_error("ERROR!!! No correct config file found!\n")
                    if(curOS == 'linux'):
                        taskset = "taskset -c {} ".format(ompCoresIdD[nThread])
                    else:
                        taskset = ""
                    cmd =  ("export OMP_NUM_THREADS={}; "
                       "{} python3 src/simulate_fastq_data/simulate_fastq.py "
                       "{} {} {} {} {} single"
                       "".format(nThread, taskset, gtf, genome, config, size, outFile))
                    output = "{}\n".format(cmd)
                    output = output + subprocess.getoutput(cmd)
                    runTime = parse_run_time(output,workPath) # Run time
                    runTimeV[tIdx]= runTime
                    tIdx = tIdx + 1
            print(" {:<10} | {:<12} | {:<15.4f} | {:<15.4f}".format(size, nThread,
                  np.mean(runTimeV), np.std(runTimeV)))
            print("--------------------------------------------------------")


    if(options == 'all' or options == 'align_rnaseq_tophat'):
        print("Aligning RNA-Seq Data sets with tophat : ")
        print("--------------------------------------------------------")
        print(" {:<10} | {:<12} | {:<15} | {:<15}".format("Size", "OMP_Threads", "mean",
              "stdev"))
        print("--------------------------------------------------------")
        outDirPref = "{}/output/rnaseq".format(workPath)
        if(not os.path.isdir(outDirPref)):
            os.mkdir(outDirPref)
        outDirPref = os.path.abspath("{}/output/rnaseq/tophat".format(workPath)) 
        if(not os.path.isdir(outDirPref)):
            os.mkdir(outDirPref)

        inDirPref  = os.path.abspath("{}/data/rnaseq/fastq".format(workPath)) 
        if(not os.path.isdir(inDirPref)):
            exit_with_error("ERROR!!! fastq data does not exits. Run build_rnaseq_data option")
        bowtieIdxPath = "{}/Bowtie2Index/Homo_sapiens.GRC38".format(refPath)
        ## Loop
        for size in rnaSeqSizeL:
            sampFileL   = glob.glob("{}/{}/*.fq".format(inDirPref,size))
            if(not os.path.isdir("{}/{}".format(outDirPref,size))):
                os.mkdir("{}/{}".format(outDirPref,size))

            for nThread in [1]:     # Tophat is poorly parallelizable
                runTimeV = np.zeros([len(sampFileL)])
                tIdx = 0
                for samp in sampFileL:
                    sampDir = samp.split("/")[-1].split(".")[0]
                    ## Set output directory
                    outDir = "{}/{}/{}".format(outDirPref,size,sampDir)

                    if(curOS == "osx"):
                        # My OSX configuration b/c I use virtualenv
                        python2="source ~/.local/virtualenvs/python2.7/bin/activate;"
                        cmd =  (
                            "{}; time {} tophat2 -p {} -o {} {} {}"
                            "".format(python2,taskset, nThread, outDir,
                            bowtieIdxPath, samp))
                    elif(curOS == 'linux'):
                    #    # On CentOS, default python is 2.6.6
                    #    python2="/usr/bin/python"
                        taskset = "taskset -c {} ".format(ompCoresIdD[nThread])
                        cmd =  (
                            "time {} tophat2 -p {} -o {} {} {}"
                            "".format(taskset, nThread, outDir,
                            bowtieIdxPath, samp))
                    else:
                        exit_with_error("ERROR!!! OS not supported")
                    output = "{}\n".format(cmd)
                    output = output + subprocess.getoutput(cmd)
                    runTime = parse_run_time(output,workPath) # Run time
                    runTimeV[tIdx]= runTime
                    tIdx = tIdx + 1
                print(" {:<10} | {:<12} | {:<15.4f} | {:<15.4f}".format(size, nThread,
                      np.mean(runTimeV), np.std(runTimeV)))
                print("--------------------------------------------------------")


    if(options == 'all' or options == 'align_rnaseq_hisat'):
        print("Aligning RNA-Seq Data sets with hisat : ")
        print("--------------------------------------------------------")
        print(" {:<10} | {:<12} | {:<15} | {:<15}".format("Size", "OMP_Threads", "mean",
              "stdev"))
        print("--------------------------------------------------------")
        # Get directory structure
        outDirPref = "{}/output/rnaseq".format(workPath)
        if(not os.path.isdir(outDirPref)):
            os.mkdir(outDirPref)
        outDirPref = os.path.abspath("{}/output/rnaseq/hisat".format(workPath)) ## prefix
        if(not os.path.isdir(outDirPref)):
            os.mkdir(outDirPref)
        inDirPref  = os.path.abspath("{}/data/rnaseq/fastq".format(workPath))   ## prefix
        if(not os.path.isdir(inDirPref)):
            exit_with_error("ERROR!!! fastq data does not exits. Run build_rnaseq_data option")
        hisatIdxPath = "{}/HisatIndex/genome".format(refPath)
        ## Loop
        for size in rnaSeqSizeL:
            sampFileL   = glob.glob("{}/{}/*.fq".format(inDirPref,size))
            if(not os.path.isdir("{}/{}".format(outDirPref,size))):
                os.mkdir("{}/{}".format(outDirPref,size))

            for nThread in ompNumThreadsL:
                runTimeV = np.zeros([len(sampFileL)])
                tIdx = 0
                for samp in sampFileL:
                    sampDir = samp.split("/")[-1].split(".")[0]
                    ## Set output directory
                    outDir = "{}/{}/{}".format(outDirPref,size,sampDir)
                    if(curOS == 'linux'):
                        taskset = "taskset -c {} ".format(ompCoresIdD[nThread])
                    else:
                        taskset = ""
                    if(not os.path.isdir(outDir)):
                        os.mkdir(outDir)
                    cmd =  (
                        "time {} hisat2 -p {} --phred33 -x {} -U {} -S {}/output.sam"
                       "".format(taskset, nThread, hisatIdxPath, samp, outDir))
                    output = "{}\n".format(cmd)
                    output = output + subprocess.getoutput(cmd)
                    runTime = parse_run_time(output,workPath) # Run time
                    runTimeV[tIdx]= runTime
                    tIdx = tIdx + 1
                print(" {:<10} | {:<12} | {:<15.4f} | {:<15.4f}".format(size, nThread,
                      np.mean(runTimeV), np.std(runTimeV)))
                print("--------------------------------------------------------")


    if(options == 'all' or options == 'cufflinks_assemble'):
        print("Assembling transcriptome using cufflinks: ")
        print("--------------------------------------------------------")
        print(" {:<10} | {:<12} | {:<15} | {:<15}".format("Size", "OMP_Threads", "mean",
              "stdev"))
        print("--------------------------------------------------------")
        outDirPref = os.path.abspath("{}/output/rnaseq/".format(workPath)) ## prefix
        if(not os.path.isdir(outDirPref)):
            os.mkdir(outDirPref)
        outDirPref = os.path.abspath("{}/output/rnaseq/cufflinks".format(workPath)) ## prefix
        if(not os.path.isdir(outDirPref)):
            os.mkdir(outDirPref)
        inDirPref  = os.path.abspath("{}/output/rnaseq/tophat".format(workPath))   ## prefix
        gtf="{}/Homo_sapiens.GRCh38.83.gtf".format(refPath)
        ## Loop
        for size in rnaSeqSizeL:
            sampFileL   = glob.glob("{}/{}/*/accepted_hits.bam".format(inDirPref,size))
            if(not os.path.isdir("{}/{}".format(outDirPref,size))):
                os.mkdir("{}/{}".format(outDirPref,size))

            for nThread in ompNumThreadsL:
                runTimeV = np.zeros([len(sampFileL)])
                tIdx = 0
                for samp in sampFileL:
                    sampDir = samp.split("/")[-2].split(".")[0]
                    ## Set output directory
                    outDir = "{}/{}/{}".format(outDirPref,size,sampDir)
                    if(not os.path.isdir(outDir)):
                        os.mkdir(outDir)
                    if(curOS == 'linux'):
                        taskset = "taskset -c {} ".format(ompCoresIdD[nThread])
                    else:
                        taskset = ""
                    cmd =  (
                        "time {} cufflinks --num-threads {} -g {} --output-dir {} {}"
                       "".format(taskset, nThread, gtf, outDir, samp))
                    output = "{}\n".format(cmd)
                    output = output + subprocess.getoutput(cmd)
                    runTime = parse_run_time(output,workPath) # Run time
                    runTimeV[tIdx]= runTime
                    tIdx = tIdx + 1
                print(" {:<10} | {:<12} | {:<15.4f} | {:<15.4f}".format(size, nThread,
                      np.mean(runTimeV), np.std(runTimeV)))
                print("--------------------------------------------------------")


    if(options == 'all' or options == 'cuffmerge'):
        print("Merging assembled transcriptomes using cuffmerge")
        print("--------------------------------------------------------")
        print(" {:<10} | {:<12} | {:<15} | {:<15}".format("Size", "OMP_Threads", "mean",
              "stdev"))
        print("--------------------------------------------------------")
        outDirPref = os.path.abspath("{}/output/rnaseq/".format(workPath)) ## prefix
        if(not os.path.isdir(outDirPref)):
            os.mkdir(outDirPref)
        outDirPref = os.path.abspath("{}/output/rnaseq/cuffmerge".format(workPath)) ## prefix
        if(not os.path.isdir(outDirPref)):
            os.mkdir(outDirPref)
        inDirPref  = os.path.abspath("{}/output/rnaseq/cufflinks".format(workPath))   ## prefix
        gtf="{}/Homo_sapiens.GRCh38.83.gtf".format(refPath)
        genome="{}/Homo_sapiens.GRCh38.dna.primary_assembly.fa".format(refPath)
        curDir = os.path.dirname(os.path.realpath(__file__))
        
        ## Loop
        for size in rnaSeqSizeL:
            sampFileL   = glob.glob("{}/{}/*/transcripts.gtf".format(inDirPref,size))
            outDir = "{}/{}".format(outDirPref,size)

            if(not os.path.isdir(outDir)):
                os.mkdir(outDir)
            assemblyPath = "{}/assemblies.txt".format(outDir)
            if(not os.path.isfile(assemblyPath)):
                assemblyFile = open(assemblyPath, "w+")
                for samp in sampFileL:
                    assemblyFile.write("{}\n".format(samp))
                assemblyFile.close()

            for nThread in ompNumThreadsL:
                ## Consider adding nTrials here.
                runTimeV = np.zeros([1])
                tIdx = 0
                if(curOS == "osx"):
                    # My OSX configuration b/c I use virtualenv
                    python2="source ~/.local/virtualenvs/python2.7/bin/activate;"
                    cmd =  (
                        "{};"
                        "time  cuffmerge --num-threads {} -o {} "
                        "--ref-gtf {} --ref-sequence {} {}"
                        "".format(python2,nThread, outDir, gtf, genome,
                        assemblyPath))
                elif(curOS == "linux"):
                    # On CentOS, default python is 2.6.6
                    # python2="/usr/bin/python"
                    taskset = "taskset -c {} ".format(ompCoresIdD[nThread])
                    cmd =  (
                        "pwd; cd /tmp/; alias python='/usr/bin/python';"
                        "time {} cuffmerge --num-threads {} -o {} "
                        "--ref-gtf {} --ref-sequence {} {}; cd {}/../"
                        "".format(taskset, nThread, outDir, gtf, genome,
                        assemblyPath, curDir))
                else:
                    exit_with_error("ERROR!!! Unsupported OS.")
                output = "{}\n".format(cmd)
                output = output + subprocess.getoutput(cmd)
                runTime = parse_run_time(output,workPath) # Run time
                runTimeV[tIdx]= runTime
                tIdx = tIdx + 1
                print(" {:<10} | {:<12} | {:<15.4f} | {:<15.4f}".format(size, nThread,
                      np.mean(runTimeV), np.std(runTimeV)))
                print("--------------------------------------------------------")


    if(options == 'all' or options == 'cuffcompare'):
        print("Comparing cufflinks gtf using cuffcompare")
        print("--------------------------------------------------------")
        print(" {:<10} | {:<12} | {:<15} | {:<15}".format("Size", "OMP_Threads", "mean",
              "stdev"))
        print("--------------------------------------------------------")
        # Check and make directory structure
        outDirPref = os.path.abspath("{}/output/rnaseq/".format(workPath)) ## prefix
        if(not os.path.isdir(outDirPref)):
            exit_with_error("ERROR!!! Expecting {}/output/rnaseq. Must have run tophat "
                            "and cufflinks prior\n".format(workPath))
        outDirPref = os.path.abspath("{}/output/rnaseq/cuffcompare".format(workPath)) 
        if(not os.path.isdir(outDirPref)):
            os.mkdir(outDirPref)
        inDirPref  = os.path.abspath("{}/output/rnaseq/cufflinks".format(workPath))   ## prefix
        gtf="{}/Homo_sapiens.GRCh38.83.gtf".format(refPath)
        genome="{}/Homo_sapiens.GRCh38.dna.primary_assembly.fa".format(refPath)
        nThread = 1
        ## Loop
        for size in rnaSeqSizeL:
            sampFileL   = glob.glob("{}/{}/*/transcripts.gtf".format(inDirPref,size))
            outPref = "{}/{}".format(outDirPref,size)

            ## Consider adding nTrials here.
            runTimeV = np.zeros([1])
            tIdx = 0
            if(curOS == 'linux'):
                taskset = "taskset -c {} ".format(ompCoresIdD[nThread])
            else:
                taskset = ""
            cmd =  (
                    "time {} cuffcompare -o {} -r {} -R -C -V {}"
                    "".format(taskset,outPref, gtf, " ".join(sampFileL)))
            output = "{}\n".format(cmd)
            output = output + subprocess.getoutput(cmd)
            runTime = parse_run_time(output,workPath) # Run time
            runTimeV[tIdx]= runTime
            tIdx = tIdx + 1
            print(" {:<10} | {:<12} | {:<15.4f} | {:<15.4f}".format(size, nThread,
                  np.mean(runTimeV), np.std(runTimeV)))
            print("--------------------------------------------------------")


    if(options == 'all' or options == 'cuffquant'):
        print("Quantifying gene expression using cuffquant")
        print("--------------------------------------------------------")
        print(" {:<10} | {:<12} | {:<15} | {:<15}".format("Size", "OMP_Threads", "mean",
              "stdev"))
        print("--------------------------------------------------------")
        outDirPref = os.path.abspath("{}/output/rnaseq/".format(workPath)) ## prefix
        if(not os.path.isdir(outDirPref)):
            os.mkdir(outDirPref)
        outDirPref = os.path.abspath("{}/output/rnaseq/cuffquant".format(workPath)) ## prefix
        if(not os.path.isdir(outDirPref)):
            os.mkdir(outDirPref)
        inGtfDirPref  = os.path.abspath("{}/output/rnaseq/cuffmerge".format(workPath))   ## prefix
        inBamDirPref  = os.path.abspath("{}/output/rnaseq/tophat".format(workPath))   ## prefix
        ## Loop
        for size in rnaSeqSizeL:
            bamFileL  = glob.glob("{}/{}/*/accepted_hits.bam".format(inBamDirPref,size))
            outDir = "{}/{}".format(outDirPref,size)
            gtf="{}/{}/merged.gtf".format(inGtfDirPref,size)
            if(not os.path.isdir(outDir)):
                os.mkdir(outDir)

            for nThread in ompNumThreadsL:
                ## Consider adding nTrials here.
                runTimeV = np.zeros([len(bamFileL)])
                tIdx = 0

                for bamFile in bamFileL:
                    outDirSamp = "{}/{}".format(outDir,bamFile.split("/")[-2].split(".")[0])
                    if(not os.path.isdir(outDirSamp)):
                        os.mkdir(outDirSamp)
                    if(curOS == 'linux'):
                        taskset = "taskset -c {} ".format(ompCoresIdD[nThread])
                    else:
                        taskset = ""
                    cmd =  (
                        "time {} cuffquant --num-threads {} --output-dir {} "
                        "{} {}"
                            "".format(taskset, nThread, outDirSamp, gtf, bamFile))
                    output = "{}\n".format(cmd)
                    output = output + subprocess.getoutput(cmd)
                    runTime = parse_run_time(output,workPath) # Run time
                    runTimeV[tIdx]= runTime
                    tIdx = tIdx + 1
                print(" {:<10} | {:<12} | {:<15.4f} | {:<15.4f}".format(size, nThread,
                      np.mean(runTimeV), np.std(runTimeV)))
                print("--------------------------------------------------------")


    if(options == 'all' or options == 'cuffnorm'):
        print("Quantifying gene expression using cuffnorm")
        print("--------------------------------------------------------")
        print(" {:<10} | {:<12} | {:<15} | {:<15}".format("Size", "OMP_Threads", "mean",
              "stdev"))
        print("--------------------------------------------------------")
        outDirPref = os.path.abspath("{}/output/rnaseq/".format(workPath)) ## prefix
        if(not os.path.isdir(outDirPref)):
            os.mkdir(outDirPref)
        outDirPref = os.path.abspath("{}/output/rnaseq/cuffnorm".format(workPath)) ## prefix
        if(not os.path.isdir(outDirPref)):
            os.mkdir(outDirPref)
        inGtfDirPref  = os.path.abspath("{}/output/rnaseq/cuffmerge".format(workPath))   ## prefix
        inCxbDirPref  = os.path.abspath("{}/output/rnaseq/cuffquant".format(workPath))   ## prefix
        ## Loop
        for size in rnaSeqSizeL:
            cxbFileL  = glob.glob("{}/{}/*/abundances.cxb".format(inCxbDirPref,size))
            cxbFileL  = sorted(cxbFileL)    ## Break up into replicates
            # Get treat and wt groups
            sampNameL = [name.split('/')[-2] for name in cxbFileL]
            treatIdxL = ['treat_' in name for name in sampNameL]
            wtIdxL    = ['wt_' in name for name in sampNameL]
            treatCxbL = []
            wtCxbL    = []
            for idx in range(len(treatIdxL)):
                if(treatIdxL[idx] == True):
                    treatCxbL.append(cxbFileL[idx])
                elif(wtIdxL[idx] == True):
                    wtCxbL.append(cxbFileL[idx])
                else:
                    exit_with_error("ERROR!!! neither treatIdxL[idx] {} nor wtIdxL[idx] "
                                    "{} are" "True".format(treatIdxL[idx], wtIdxL[idx]))
            
            outDir = "{}/{}".format(outDirPref,size)
            gtf="{}/{}/merged.gtf".format(inGtfDirPref,size)
            if(not os.path.isdir(outDir)):
                os.mkdir(outDir)

            for nThread in ompNumThreadsL:
                ## Consider adding nTrials here.
                runTimeV = np.zeros([1])
                tIdx = 0
                if(curOS == 'linux'):
                    taskset = "taskset -c {} ".format(ompCoresIdD[nThread])
                else:
                    taskset = ""
                cmd =  (
                    "time {} cuffnorm --num-threads {} --output-dir {} -L {} "
                      " {} {} {}"
                      "".format(taskset, nThread, outDir, "treat,wt",  gtf, 
                                ",".join(treatCxbL), ",".join(wtCxbL)))
                #print(cmd)
                output = "{}\n".format(cmd)
                output = output + subprocess.getoutput(cmd)
                runTime = parse_run_time(output,workPath) # Run time
                runTimeV[tIdx]= runTime
                tIdx = tIdx + 1
                print(" {:<10} | {:<12} | {:<15.4f} | {:<15.4f}".format(size, nThread,
                      np.mean(runTimeV), np.std(runTimeV)))
                print("--------------------------------------------------------")



    if(options == 'all' or options == 'cuffdiff'):
        print("Quantifying gene expression using cuffdiff")
        print("--------------------------------------------------------")
        print(" {:<10} | {:<12} | {:<15} | {:<15}".format("Size", "OMP_Threads", "mean",
              "stdev"))
        print("--------------------------------------------------------")
        outDirPref = os.path.abspath("{}/output/rnaseq/".format(workPath)) ## prefix
        if(not os.path.isdir(outDirPref)):
            os.mkdir(outDirPref)
        outDirPref = os.path.abspath("{}/output/rnaseq/cuffdiff".format(workPath)) ## prefix
        if(not os.path.isdir(outDirPref)):
            os.mkdir(outDirPref)
        inGtfDirPref  = os.path.abspath("{}/output/rnaseq/cuffmerge".format(workPath))   ## prefix
        inCxbDirPref  = os.path.abspath("{}/output/rnaseq/cuffquant".format(workPath))   ## prefix
        ## Loop
        for size in rnaSeqSizeL:
            cxbFileL  = glob.glob("{}/{}/*/abundances.cxb".format(inCxbDirPref,size))
            cxbFileL  = sorted(cxbFileL)    ## Break up into replicates
            # Get treat and wt groups
            sampNameL = [name.split('/')[-2] for name in cxbFileL]
            treatIdxL = ['treat_' in name for name in sampNameL]
            wtIdxL    = ['wt_' in name for name in sampNameL]
            treatCxbL = []
            wtCxbL    = []
            for idx in range(len(treatIdxL)):
                if(treatIdxL[idx] == True):
                    treatCxbL.append(cxbFileL[idx])
                elif(wtIdxL[idx] == True):
                    wtCxbL.append(cxbFileL[idx])
                else:
                    exit_with_error("ERROR!!! neither treatIdxL[idx] {} nor wtIdxL[idx] "
                                    "{} are" "True".format(treatIdxL[idx], wtIdxL[idx]))
            
            outDir = "{}/{}".format(outDirPref,size)
            gtf="{}/{}/merged.gtf".format(inGtfDirPref,size)
            if(not os.path.isdir(outDir)):
                os.mkdir(outDir)

            
            # Cuffdiff is too time intensive to go over all threads
            for nThread in [ompNumThreadsL[0]]:  # Cheap hack iter over only nthread=1.  
                ## Consider adding nTrials here.
                runTimeV = np.zeros([1])
                tIdx = 0
                if(curOS == 'linux'):
                    taskset = "taskset -c {} ".format(ompCoresIdD[nThread])
                else:
                    taskset = ""
                cmd =  (
                    "time {} cuffdiff --num-threads {} --output-dir {} -L {} "
                      " {} {} {}"
                      "".format(taskset, nThread, outDir, "treat,wt",  gtf, 
                                ",".join(treatCxbL), ",".join(wtCxbL)))
                #print(cmd)
                output = "{}\n".format(cmd)
                output = output + subprocess.getoutput(cmd)
                runTime = parse_run_time(output,workPath) # Run time
                runTimeV[tIdx]= runTime
                tIdx = tIdx + 1
                print(" {:<10} | {:<12} | {:<15.4f} | {:<15.4f}".format(size, nThread,
                      np.mean(runTimeV), np.std(runTimeV)))
                print("--------------------------------------------------------")





    # Note :
    #   1. This will only run on Linux, not OSX
    #   2. Per John, it is nearly pointless to run multiple threads here.
    #      Just run it via his run_kelvin.sh, and leave my machinery out of it
    #   3. His script computes only the mean, but I'll shoehorn it into my
    #      reporting scheme (only one run is recorded, so stdev prints as 0)
    if(options == 'all' or options == 'kelvin'):
        print("Runnning Kelving...")
        print("--------------------------------------------------------")
        print(" {:<10} | {:<12} | {:<15} | {:<15}".format("Short", "OMP_Threads", "mean",
              "stdev"))
        print("--------------------------------------------------------")

        # Create output directory structure
        outDirPref  = "{}/output".format(workPath)
        if(not os.path.isdir(outDirPref)):
            os.mkdir(outDirPref)
        outDirPref  = "{}/kelvin".format(outDirPref)
        curDir = os.path.dirname(os.path.realpath(__file__))
        if(not os.path.isdir(outDirPref)):
            os.mkdir(outDirPref)
        nThread = 1
        runTimeV = np.zeros([1])

        ## Loop
        outDir = "{}".format(outDirPref)
        if(not os.path.isdir(outDir)):
            os.mkdir(outDir)

        cmd =  ("export LD_LIBRARY_PATH={}/kelvin/:$LD_LIBRARY_PATH;"
                "export PATH={}/kelvin/:$PATH;"
                "bash {}/kelvin/run_kelvin.sh {} {}/kelvin" # arg1 =outputdir, arg2=/path/to/kelvin.conf
                "".format(curDir, curDir, curDir, outDir, curDir))
        output = "{}\n".format(cmd)
        output = output + subprocess.getoutput(cmd)
        runTime = parse_run_time(output,workPath) # Run time
        runTimeV[0]= runTime
        print(" {:<10} | {:<12} | {:<15.4f} | {:<15.4f}".format("Short", nThread,
              np.mean(runTimeV), np.std(runTimeV)))
        print("--------------------------------------------------------")



    print("Run Time for {} option : {:.4f} h\n\n".format(options,(time.time() - startTime)/3600.0))
    sys.exit(0)
Exemplo n.º 22
0
def create_insert(Transcript=None,
                  ReadLength=None,
                  Mu=None,
                  Sigma=None,
                  ExonList=None):
    """
    ARGS:
        Transcript : a TRANSCRIPT instance
        ReadLength : length of reads. Different from insert length
        Mu         : the mean of fragment length distribution
        Sigma      : the standard deviation of fragment length distribution
        ExonList   : list of exons, passed through unchanged to INSERT()

    RETURN:
        AN INSERT of length n, where n is drawn from a normal distribution
        with mean Mu and standard deviation Sigma, truncated at the end of the
        transcript and never shorter than ReadLength.

    DESCRIPTION:
        Picks a random start on the transcript, draws a fragment length,
        clips the fragment to the transcript and retries until the fragment
        is at least ReadLength long. The surviving fragment is wrapped in an
        INSERT.

    DEBUG:

    FUTURE:
        1. Implement Proper solution where insert going into the Illumina adapter when
           stop - start < ReadLength
    """
    # Type check must come before Transcript.seq is touched, otherwise a bad
    # argument dies with an AttributeError instead of the intended message.
    if (not isinstance(Transcript, TRANSCRIPT)):
        exit_with_error(
            "ERROR! Transcript is not of class type TRANSCRIPT 1\n")

    # Without this the loop below would not run and there would be nothing
    # to return.
    if (ReadLength is None or ReadLength < 1):
        exit_with_error("ERROR! ReadLength ({}) must be >= 1\n".format(
            ReadLength))

    transLength = len(Transcript.seq)
    lastPos = transLength - 1  # Largest permitted stop position

    # start >= 0 and stop <= lastPos, so an insert can never be longer than
    # lastPos. If that is shorter than a read the retry loop would spin
    # forever, so bail out up front.
    if (lastPos < ReadLength):
        exit_with_error("ERROR! Transcript of length {} is too short for "
                        "ReadLength {}\n".format(transLength, ReadLength))

    insert = None
    insertLength = 0

    # Ensure inserts are at least as long as the readlength
    while (insertLength < ReadLength):
        start = random.randint(0, lastPos)
        stop = start + int(numpy.random.normal(Mu, Sigma))
        # Avoid inserts that are past end of transcripts.
        # Proper solution here would have insert going into the Illumina adapter
        if (stop > lastPos):
            stop = lastPos
        # Avoid unrealistically short inserts. Clipping only ever shortens the
        # fragment, so one test after clipping covers both cases.
        if (stop - start < ReadLength):
            continue
        insert = INSERT(Transcript=Transcript,
                        StartWrtTrans=start,
                        StopWrtTrans=stop,
                        ReadLength=ReadLength,
                        ExonList=ExonList)

        insertLength = len(insert.seq)

    return insert
Exemplo n.º 23
0
    def __init__(self, Transcript = None, StartWrtTrans = 0, StopWrtTrans = 0,
                 ReadLength = 0, ExonList = None):
        """
        ARGS:
            Transcript    = a TRANSCRIPT instance
            StartWrtTrans = int, start of the insert w/r/t the transcript. Used
                            as the lower bound of the slice of Transcript.seq
            StopWrtTrans  = int, stop of the insert w/r/t the transcript. Used
                            as the upper bound of the slice of Transcript.seq
            ReadLength    = int, length of read 1 and read 2
            ExonList      = list of exons, indexed with Transcript.exonIdxList

        RETURN:
            NONE : Initializes INSERT

        DESCRIPTION:
            Copies chrm, strand, geneID and transID from Transcript, slices the
            insert sequence out of Transcript.seq and then walks the
            transcript's exons to find which exons the insert, read 1 and
            read 2 touch and where each of them starts / stops.
            Exits via exit_with_error() if the insert is shorter than
            ReadLength, if a required Transcript field is None, or if either
            read could not be placed on the exons.
    
        DEBUG:
            1. Created a crappy shell script, extract_read.sh
               Looking at the reads in the file generated by 
               create_fastq_file() -> create_insert(), you'll see metadata look
               something like :

               @Read_num:0:trans:"ENST00000618181":start:935894:exons:"ENSE00002703998":935772:935896:"ENSE00002686739":939040:939129

               To check to see if the metadata correctly describes the read 
               (and by extension the read itself is likely correct), run :
    
               grep `bash src/simulate_fastq_data/extract_read.sh 935894 935772 \
                     935896 939040 939129` tmp.fq

               CONCLUSION : single end reads of transcripts/inserts on the '+'
                            strand in the sense direction work.

            2. Tested ENST00000488147 for the reverse strand of DNA
               --> Spot checked on ensembl reads with 1 or 2 exons. 
                   Appears to work, but there is a bug (see below)

        FUTURE: 
            1. Check that the Read 2 is correctly handled.
            2. BUG!!! The start and stop values put in the metadata when using
                      a transcript that is on the reverse strand is wrong. This 
                      needs fixed!
            3. Possible BUG : Added conditional to exclude duplicate exons 
               when exons are completely spanned. If I thought about the equalities
               harder, the additional conditional would likely be unnecessary
        """
        self.seq    = None        # str, sequence
        self.chrm   = None        # str, Chromosome
        self.start  = -1          # int, start position w/r/t the chromosome
        self.stop   = -1          # int, stop position w/r/t the chromosome
        self.strand = None        # str, '+' (forward) or '-' (reverse)
        self.geneID = None        # str, nominal geneID, but may belong to multiple genes
        self.transID = None       # str, nominal transID, may belong to mult. trans.
        self.transNum= None       # str, only number portion of transID for sorting by transcript num
        # The insert must be able to hold at least one full read
        if(StopWrtTrans - StartWrtTrans < ReadLength):
            exit_with_error("ERROR!! StopWrtTrans - StartWrtTrans ({}) < "
                            "ReadLength ({})\n".format(StopWrtTrans - StartWrtTrans,
                            ReadLength))
        # type check
        if(not isinstance(Transcript, TRANSCRIPT)):
            exit_with_error("ERROR! Transcript is not of class type TRANSCRIPT 2\n")
        else:
            self.transcript = Transcript

        # Copy the identifying fields from the transcript; every one is mandatory
        if(Transcript.chrm is not None):
            self.chrm = Transcript.chrm
        else:
            exit_with_error("ERROR! Transcript.chrm is None\n")
        if(Transcript.strand is not None):
            self.strand = Transcript.strand
        else:
            exit_with_error("ERROR! Transcript.strand is None\n")
        if(Transcript.geneID is not None):
            self.geneID = Transcript.geneID
        else:
            exit_with_error("ERROR! Transcript.geneID is None\n")
        if(Transcript.transID is not None):
            self.transID = Transcript.transID
            # Drop the first 4 characters (presumably the "ENST" prefix -- TODO confirm)
            self.transNum = self.transID[4:]
        else:
            exit_with_error("ERROR! Transcript.transcriptID is None\n")

        # get the sequence of the insert
        self.seq = Transcript.seq[StartWrtTrans:StopWrtTrans]

        # get the start and stop position of insert and associated reads (1 and 2)
        # relative to the chromosome. May discard read 2 if single end
        transPos= 0               # Running offset of the current exon, in transcript coords
        insertStart = 0
        insertStop  = 0
        exonL = [ExonList[idx] for idx in self.transcript.exonIdxList]
        # Order the exons by start position : ascending for '+', descending for '-'
        if(Transcript.strand == '+'):     # Forward DNA strand
            exonL = sorted(exonL, key=operator.attrgetter('start'))
        elif(Transcript.strand == '-'):   # Reverse DNA strand
            exonL = sorted(exonL, key=operator.attrgetter('start'), reverse=True)
        else:
            exit_with_error("ERROR!! invalid value for Transcript.strand : {}".format(
                            Transcript.strand))
        exonSpanL= [] # List of exons spanned by insert
        # Read 1 : the first ReadLength positions of the insert
        r1StartWrtTrans = StartWrtTrans 
        r1StopWrtTrans  = StartWrtTrans + ReadLength - 1 
        r1ExonSpanL = []
        r1Start = None
        r1Stop  = None
        # Read 2 : anchored at the insert stop and running backwards, so its
        # "start" is the larger transcript coordinate and its "stop" the smaller
        r2StartWrtTrans = StopWrtTrans
        r2StopWrtTrans  = StopWrtTrans - ReadLength + 1 
        r2ExonSpanL = []
        r2Start = None
        r2Stop  = None

        # Get all exons spanned by insert and reads 1 & 2.
        for exon in exonL:
            exonStart = transPos                    ## In transcript coords
            exonStop  = transPos + len(exon.seq)    ## In transcript coords
            ##### Insert #####
            ## NOTE(review): unlike the read lists below, exonSpanL is not
            ## de-duplicated, so the same exon can be appended more than once
            ## Insert starts in exon
            if(StartWrtTrans >= exonStart and StartWrtTrans <= exonStop):
                exonSpanL.append(exon)
                insertStart = exon.start + (StartWrtTrans - exonStart)
            ## Insert spans exon
            if(StartWrtTrans <= exonStart and StopWrtTrans >= exonStop):
                exonSpanL.append(exon)
            ## Insert ends in exon
            if(StopWrtTrans >= exonStart and StopWrtTrans <= exonStop):
                exonSpanL.append(exon)
                insertStop = exon.start + (StopWrtTrans - exonStart)

            ##### Read 1 #####
            ## Read 1 starts in exon
            if(r1StartWrtTrans >= exonStart and r1StartWrtTrans <= exonStop):
                r1ExonSpanL.append(exon)
                r1Start = exon.start + (r1StartWrtTrans - exonStart)
            ## Read 1 spans exon
            if(r1StartWrtTrans <= exonStart and r1StopWrtTrans >= exonStop):
                if(exon not in r1ExonSpanL):
                    r1ExonSpanL.append(exon)
            ## Read 1 ends in exon
            if(r1StopWrtTrans >= exonStart and r1StopWrtTrans <= exonStop):
                # Prevent duplicates
                if(exon not in r1ExonSpanL):
                    r1ExonSpanL.append(exon)
                r1Stop = exon.start + (r1StopWrtTrans - exonStart)

            ##### Read 2 #####
            ## Read 2's stop (its smaller transcript coordinate) lies in exon
            if(r2StopWrtTrans >= exonStart and r2StopWrtTrans <= exonStop):
                r2ExonSpanL.append(exon)
                r2Stop = exon.start + (r2StopWrtTrans - exonStart)
            ## Read 2 spans exon
            if(r2StopWrtTrans <= exonStart and r2StartWrtTrans >= exonStop):
                if(exon not in r2ExonSpanL):
                    r2ExonSpanL.append(exon)
            ## Read 2's start (its larger transcript coordinate) lies in exon
            if(r2StartWrtTrans >= exonStart and r2StartWrtTrans <= exonStop):
                if(exon not in r2ExonSpanL):
                    r2ExonSpanL.append(exon)
                r2Start = exon.start + (r2StartWrtTrans - exonStart)

            # Advance to the first transcript position of the next exon
            transPos = transPos + len(exon.seq)

        ## Error Check ##
        if(len(r1ExonSpanL) == 0):
            exit_with_error("ERROR! Read 1 does _not_ span any exons!\n")
        if(len(r2ExonSpanL) == 0):
            exit_with_error("ERROR! Read 2 does _not_ span any exons!\n")
        if(r1Start is None or r1Stop is None or r2Start is None or r2Stop is None):
            exit_with_error("ERROR!! Invalid trans={}, r1Start,r1Stop,r2Start,r2Stop = "
                            "{},{},{},{}\n".format(Transcript.transID,r1Start,r1Stop,
                            r2Start,r2Stop))
        # Check for duplicate exons (bad!)
        if(len(r1ExonSpanL) != len(set(r1ExonSpanL))):
            exit_with_error("ERROR!! {} : len(r1ExonSpanL) {} != "
                            "len(set(r1ExonSpanL)) {}\n".format(Transcript.transID,
                            len(r1ExonSpanL),len(set(r1ExonSpanL))))
        if(len(r2ExonSpanL) != len(set(r2ExonSpanL))):
            exit_with_error("ERROR!! {} : len(r2ExonSpanL) {} != "
                            "len(set(r2ExonSpanL)) {}\n".format(Transcript.transID,
                            len(r2ExonSpanL),len(set(r2ExonSpanL))))

        self.start = insertStart    ## In chromosome coords
        self.stop  = insertStop     ## In chromosome coords
        self.exonL = exonSpanL      ## Exons spanned

        self.r1Start = r1Start      ## In chromosome coords
        self.r1Stop  = r1Stop       ## In chromosome coords
        self.r1ExonL = r1ExonSpanL  ## Exons spanned

        self.r2Start = r2Start      ## In chromosome coords
        self.r2Stop  = r2Stop       ## In chromosome coords
        self.r2ExonL = r2ExonSpanL  ## Exons spanned
Exemplo n.º 24
0
    def __init__(self, GtfEntry):
        """
        ARGS:
            GtfEntry = a single GTF_ENTRY class element

        RETURN:
            NONE : Initializes EXON

        DESCRIPTION:
            Copies chrm, start, stop, strand, exonNum, exonID, geneID and
            transID from GtfEntry. start and stop are converted to int.
            A field that is None is reported through exit_with_error().
            The sequence (seq) is not filled in here and stays None.

        DEBUG:

        FUTURE:
        """
        # (attribute name, converter or None, message used when it is None)
        # Order matters : it is the order in which the fields are validated.
        fieldSpecL = (
            ("chrm",    None, "ERROR! GtfEntry.chrm is None\n"),        # str, Chromosome
            ("start",   int,  "ERROR! GtfEntry.start is None\n"),       # int, Start position on chrm
            ("stop",    int,  "ERROR! GtfEntry.stop is None\n"),        # int, End position on chrm
            ("strand",  None, "ERROR! GtfEntry.strand is None\n"),      # str, '+' (forward) or '-' (reverse)
            ("exonNum", None, "ERROR! GtfEntry.exonNum is None\n"),     # '1' indexed exon position
            ("exonID",  None, "ERROR! GtfEntry.exonID is None\n"),      # str, exon ID
            ("geneID",  None, "ERROR! GtfEntry.geneID is None\n"),      # str, nominal geneID
            ("transID", None, "ERROR! GtfEntry.transcriptID is None\n"),  # str, nominal transcript ID
        )

        # Start with every attribute unset
        self.seq = None           # str, sequence
        for attrName, _, _ in fieldSpecL:
            setattr(self, attrName, None)

        # type check
        if(not isinstance(GtfEntry, GTF_ENTRY)):
            exit_with_error("ERROR! GtfEntry is not of class type GTF_ENTRY\n")

        # Copy each field over, complaining about the ones that are missing
        for attrName, convert, errMsg in fieldSpecL:
            value = getattr(GtfEntry, attrName)
            if(value is None):
                exit_with_error(errMsg)
            elif(convert is None):
                setattr(self, attrName, value)
            else:
                setattr(self, attrName, convert(value))
Exemplo n.º 25
0
def read_config(pathToConfig):
    """
    ARGS:
        pathToConfig : str, path to configuration file

    RETURN:
        readLength      = readlength desired
        desiredTransList= List of transcripts to use
        abundanceList   = list of relative abundances of transcripts
        numOfReads      = number of reads desired

    DESCRIPTION:
        Config file format :
            1. Comment lines begin with '#'. Blank lines are ignored.
            2. One line 'ReadLength <int>' and one line 'NumberOfReads <int>'
            3. All other lines must be transcripts with relative abundance
               The relative abundance can be any integer. Scaling is done 
               automatically. 
               E.g.
                    ENST00000488147 10
                    ENST00000473358 5
            4. Fields are separated by a single space, tabs are rejected.
        Any violation is reported through exit_with_error().

    DEBUG: 
        For small config files it reads in all the fields correctly.

    FUTURE: 
    """
    desiredTransList = []
    abundanceList = []  # integers used to get relative abundance of transcripts
    readLength = 0
    numOfReads = 0

    # 'with' closes the file even when parsing bails out part way through
    with open(pathToConfig, 'r') as configFile:
        for line in configFile:
            if (line[0] == "#"):
                continue
            line = line.rstrip("\n")  # remove trailing \n

            # Check for tabs, only spaces permitted
            if ("\t" in line):
                exit_with_error("ERROR! Tabs not permitted in config file!\n")
            # Blank lines carry no information; without this they would be
            # reported as a malformed transcript entry
            if (not line.strip()):
                continue
            line = line.split(" ")

            # ReadLength
            if (line[0] == "ReadLength"):
                if (readLength == 0):
                    readLength = int(line[1])
                    continue
                else:
                    exit_with_error(
                        "ERROR! multiple instances of ReadLength in config "
                        "file\n")

            # NumberOfReads
            if (line[0] == "NumberOfReads"):
                if (numOfReads == 0):
                    numOfReads = int(line[1])
                    continue
                else:
                    exit_with_error(
                        "ERROR! multiple instances of NumberOfReads in config "
                        "file\n")

            # Transcripts
            if (re.search('ENST', line[0])):
                desiredTransList.append(line[0])
                abundanceList.append(int(line[1]))
            else:
                exit_with_error("ERROR! Incorrect transcript entry : %s\n"
                                " All entries should begin with 'ENST'\n" %
                                (line))

    if (readLength == 0 or numOfReads == 0):
        exit_with_error("ERROR! ReadLength or NumberOfReads not specified in "
                        "config.txt\n")

    print("Config File Parameters : \nReadLength : %i\nNumberOfReads : %i" %
          (readLength, numOfReads))
    for trans, abundance in zip(desiredTransList, abundanceList):
        print("%s %i" % (trans, abundance))
    print("\n")

    return readLength, desiredTransList, abundanceList, numOfReads
Exemplo n.º 26
0
def main():
    """
    ARGS:
        None. Reads three values from the command line :
            sys.argv[1] = R0, average number of people infected
            sys.argv[2] = incubTime, days after which an infected agent
                          starts trying to infect others
            sys.argv[3] = infectTime, used in the per-day infection probability
    RETURN:
        None. Calls sys.exit(0) when done.
    DESCRIPTION:
        Agent based infection simulation. N agents, one of them infected on
        day 0. Every day each infected agent past incubTime picks one random
        agent and infects it with probability R0 / (healthyTime - infectTime).
        Agents become immune once healthyTime days have passed. Plots the
        number of infections and the measured R0 per day.
    DEBUG:
    FUTURE:
        1. Add option to fit only a specific section of data.
    """
    # Check Python version
    nArg = len(sys.argv)
    # Use python 3
    if (sys.version_info[0] != 3):
        exit_with_error("ERROR!!! Use Python 3\n")
    # Get options. Check nArg first : with no arguments sys.argv[1] does not
    # exist and indexing it would raise IndexError instead of printing help.
    if (nArg > 1 and "-h" in sys.argv[1]):
        print_help(0)
    elif (nArg != 4):
        print_help(1)

    startTime = time.time()
    print("{} \n".format(sys.argv), flush=True)
    print("   Start Time : {}".format(
        time.strftime("%a, %d %b %Y %H:%M:%S ", time.localtime())),
          flush=True)

    # Get args
    R0 = float(sys.argv[1])  # Average number of people infected
    incubTime = float(sys.argv[2])  # Time before symptoms present
    infectTime = float(sys.argv[3])  # Time before infectious
    deathRate = 0.01  # fraction of infected that die (not used yet)
    hostRate = 0.20  # Fraction that get hospitalized (not used yet)
    healthyTime = 8  # Time after infection that person is healthy and no longer infectious
    nDays = 100
    N = 10**3  # Number of agents

    # Initialization
    agentL = [AGENT(ID=i) for i in range(N)]

    # Make one person infected
    agentL[0].infected = True
    agentL[0].start = 0
    yV = np.zeros([nDays])  # Number of infections as function of time
    R0V = np.zeros([nDays])  # Rate of infection as function of time
    xV = np.asarray(range(nDays))

    # Do simulation.  Run over 100 days. I'm sure I'm screwing up my monte-carlo methods
    for day in range(nDays):
        nInfect = 0
        nTotal = 0  # number infected + number immune
        for agent in agentL:
            if (agent.infected == True):
                nInfect += 1  # Track number infected at this time step
                diff = day - agent.start
                if (diff > incubTime):
                    idx = random.randint(0,
                                         N - 1)  #index of possible infection
                    # If picking self to infect
                    # NOTE(review): this also skips the recovery check below
                    # for this agent on this day -- confirm that is intended
                    if (agentL[idx] == agent):
                        continue
                    # prob of infection = R0 / (healthyTime - infectTime)
                    # NOTE(review): divides by zero when infectTime == healthyTime
                    prob = R0 / (healthyTime - infectTime)
                    rng = random.random()
                    if (rng < prob and agentL[idx].immune == False
                            and agentL[idx].infected == False):
                        agentL[idx].infected = True
                        agentL[idx].start = day
                        agent.nInfect += 1
                        # NOTE(review): a newly infected agent that comes later
                        # in agentL is counted a second time when the loop
                        # reaches it
                        nInfect += 1
                if (diff > healthyTime):
                    agent.immune = True
                    agent.infected = False
                # Get R0 as function of time
                if (agent.immune == True):
                    R0V[day] += agent.nInfect
                    nTotal += 1
        yV[day] = nInfect
        if (nTotal > 0):
            R0V[day] = R0V[day] / nTotal

    # Get number immune (or still infected) and the mean number of people
    # each of them infected
    nImmune = 0
    meanR0 = 0
    for agent in agentL:
        if (agent.immune == True or agent.infected == True):
            nImmune += 1
            meanR0 += agent.nInfect
    meanR0 = meanR0 / nImmune
    print("Number immune == {}, R0 = {:.4f} ".format(nImmune, meanR0))

    fig, ax = plt.subplots(1, 1)
    ylabel = "Number of infections"
    ax.plot(xV, yV, label=ylabel)
    ax.legend()
    plt.show()

    fig, ax = plt.subplots(1, 1)
    ylabel = "R0"
    ax.plot(xV, R0V, label=ylabel)
    ax.legend()
    plt.show()

    print("Ended : %s" % (time.strftime("%D:%H:%M:%S")))
    print("Run Time : {:.4f} h".format((time.time() - startTime) / 3600.0))

    sys.exit(0)
Exemplo n.º 27
0
def create_fastq_file(pathToFastq, desiredTransList, abundanceList, nReads,
                      readLength, transDict, transList, exonList, readType):
    r"""
    ARGS:
        pathToFastq      : Path prefix of the output fastq file(s).
                           'single' writes <pathToFastq>.fq, the paired types
                           write <pathToFastq>-R1.fq and <pathToFastq>-R2.fq
        desiredTransList : transcripts read from config file
        abundanceList    : list of integers that sum to a value used to normalize 
                           the number of reads. 
                                E.g. trans1 has 5 and trans2 has 10, 
                                     the ratio of trans1 : trans2 = 1 : 2
        nReads           : Rough number of desired reads, the ratios from abundanceList
                           is maintained at the expense of nReads. 
                                E.g from the above example if nReads = 10, 
                                    the actual number of reads would be 
                                    3 for trans1, 6 for trans2
        readLength       : length of reads
        transDict        : Dictionary used map transID quickly to the correct
                           transcript in transList
        transList        : List of TRANSCRIPTs. Contains sequence to pass to instance of 
                           FASTQ_READ()
        exonList         : List of EXONs. Passed to FASTQ_READ() to get metadata for each
                           fastq read. E.g. the start position and exons a read spans.
        readType         : either : 'single', 'paired-fr-first' or 'paired-fr-second'
                                single           -> R1 forward
                                paired-fr-first  -> R1 reverse, R2 forward
                                paired-fr-second -> R1 forward, R2 reverse
    RETURN:
        None. File written

    DESCRIPTION:
        Writes fastq file. All reads are generated first and the output
        file(s) are only created once generation succeeded.

    DEBUG: 
        1. Blasted against ensembl database, spot checked a couple of transcripts.
           Need further debugging. 

           Took synthetic_sample.fastq and operated in vim on transcript ENST00000473358:
            Exons are ordered as : ENSE00001947070 ENSE00001922571 ENSE00001827679
            Copied synthetic_sample.fastq to poop.fq

            ****** in vim ******
            %s/^@Read.\{1,1000\}:start:\([0-9]\{1,100\}\):exons:\(.\{1,100\}\)\n\(^[A-Z]\{50\}\)\n^+\n\(^.\{50\}\)/\1\t\3\t\2/gc
            %s/:/\t/g   # remove colon sep exon names
            %s/"//g     # remove " around exon names

            ****** in shell, want exon reads start at (see order above) ******
            ****** Avoid grepping enties with start positions on the exon prior ******
            grep ENSE00001947070 poop.fq &> ENSE00001947070.txt
            grep ENSE00001922571 poop.fq | grep -v ENSE00001947070 &> ENSE00001922571.txt
            grep ENSE00001827679 poop.fq | grep -v ENSE00001922571 &> ENSE00001827679.txt
            awk '{print $1 "\t" $2}' ENSE00001947070.txt &> ENSE00001947070_1and2.txt
            awk '{print $1 "\t" $2}' ENSE00001922571.txt &> ENSE00001922571_1and2.txt
            awk '{print $1 "\t" $2}' ENSE00001827679.txt &> ENSE00001827679_1and2.txt
            awk '{print $2}' ENSE00001947070.txt | xargs -I{} grep -aob {} ENST00000473358.txt | 
                    awk 'BEGIN{FS=":"}{start = $1 + 29554; print start "\t" $2}' &> awk_out.txt
            awk '{print $2}' ENSE00001922571.txt | xargs -I{} grep -aob {} ENST00000473358.txt | 
                    awk 'BEGIN{FS=":"}{start = $1 + 30564 - 486; print start "\t" $2}' &> awk_out.txt
            awk '{print $2}' ENSE00001827679.txt | xargs -I{} grep -aob {} ENST00000473358.txt | 
                    awk 'BEGIN{FS=":"}{start = $1 + 30976 - 486 - 104; print start "\t" $2}' &> awk_out.txt
            Used diff to compare all the awk_out.txt to ENSE*_1and2.txt files.
                    CONCLUSION : they are identical. Therefor I get the correct start position from the
                                 correct sequences.
                    THEREFOR : I believe that create_fastq_file and FASTQ_READ() are working as expected.

        2. See debug comments of INSERT class.
           CONCLUSION : single end reads of transcripts/inserts on the '+' strand
                        in the sense direction work.

    FUTURE: 
        Include more error checking for goofy parameters, e.g. not enough reads for
        the ratios, etc.
    """
    abundanceSum = sum(abundanceList)
    if (abundanceSum < 1):
        # Report abundanceSum itself; the name used here before was undefined
        exit_with_error("ERROR! abundanceSum = {}\nPlease enter abundance "
                        "values > 1\n".format(abundanceSum))

    # Direction of the read that goes into each output file (R1 first, then R2)
    directionD = {
        'single': ("forward",),
        'paired-fr-first': ("reverse", "forward"),
        'paired-fr-second': ("forward", "reverse"),
    }
    if (readType not in directionD):
        exit_with_error("ERROR!!! Incorrect value for {}".format(readType))
    directionL = directionD[readType]

    if (len(directionL) == 1):
        pathL = [pathToFastq + ".fq"]
    else:
        pathL = [pathToFastq + "-R1.fq", pathToFastq + "-R2.fq"]
    fastqListL = [[] for _ in directionL]  # One list of reads per output file

    readIdx = 0
    for transIdx, transName in enumerate(desiredTransList):
        try:
            trans = transList[transDict[transName]]
        except KeyError:
            exit_with_error("ERROR! {} is not a transcript annotated in your "
                            "gtf file\n".format(transName))

        # Share of nReads that belongs to this transcript, rounded down
        nTransReads = int(
            float(abundanceList[transIdx]) / float(abundanceSum) * nReads)
        for _ in range(nTransReads):
            insert = create_insert(trans, readLength, 150, 15, exonList)
            # Paired reads come from the same insert and share the read number
            for fastqList, direction in zip(fastqListL, directionL):
                fastqList.append(
                    FASTQ_READ(Insert=insert,
                               ReadLength=readLength,
                               MetaData="@Read_num:%i" % (readIdx),
                               ExonList=exonList,
                               Direction=direction))
            readIdx += 1

    for path, fastqList in zip(pathL, fastqListL):
        _write_fastq_entries(path, fastqList)


def _write_fastq_entries(path, fastqList):
    """Write each entry of fastqList to path as a 4 line fastq record."""
    # 'with' closes the file even if a write fails
    with open(path, "w") as fastqFile:
        for fastqEntry in fastqList:
            fastqFile.write(
                "%s\n%s\n+\n%s\n" %
                (fastqEntry.metadata, fastqEntry.seq, fastqEntry.qual))
Exemplo n.º 28
0
def main():
    """
    ARGS:
        None. Reads the command line :
            sys.argv[1] = country, matched case-insensitively against the
                          second column of the csv file
            sys.argv[2] = plotType, "log-lin" ("lin-lin" is not handled yet)
            sys.argv[3] = optional int slice index for the fit.
                          < 0 fits the last |idx| days, > 0 fits the first
                          idx days. Omitted or 0 fits every day.
    RETURN:
        None. Calls sys.exit(0) when done.
    DESCRIPTION:
        Reads the confirmed cases time series for one country, fits a
        straight line to ln(cases + 1) and plots the data together with the
        fit.
    DEBUG:
    FUTURE:
        1. Add option to fit only a specific section of data.
    """
    # Check Python version
    nArg = len(sys.argv)
    # Use python 3
    if(sys.version_info[0] != 3):
        exit_with_error("ERROR!!! Use Python 3\n")
    # Get options. Check nArg first : with no arguments sys.argv[1] does not
    # exist and indexing it would raise IndexError instead of printing help.
    if(nArg > 1 and "-h" in sys.argv[1]):
        print_help(0)
    elif(nArg != 4 and nArg != 3):
        print_help(1)
    # None means "no slice requested", i.e. fit the whole series
    slcIdx = int(sys.argv[3]) if nArg == 4 else None

    startTime = time.time()
    print("{} \n".format(sys.argv),flush=True)
    print("   Start Time : {}".format(time.strftime("%a, %d %b %Y %H:%M:%S ",
                                       time.localtime())),flush=True)

    # Get args
    country = sys.argv[1]
    plotType = sys.argv[2]       # Straight line equals linear growth
    dataPath = "data/jhu/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
    countryFound = False
    df = pd.read_csv(dataPath)
    lastDate = df.columns[-1]
    for index, row in df.iterrows():
        # Select country specified
        if(row.values[1].lower() == country.lower()):
            if(countryFound == True):
                exit_with_error("ERROR!! {} should only occur "
                                "once".format(country.lower()))
            yV = np.asarray(row.values[4:],dtype=np.float32)  # y vector -cases
            xV = np.asarray(range(len(yV)))                   # x vector - days
            n  = len(xV)   # Number of days
            countryFound = True
    # Without a match xV, yV and n are never set, so stop with a clear message
    if(not countryFound):
        exit_with_error("ERROR!! {} not found in {}\n".format(country,
                        dataPath))

    fig, ax = plt.subplots(1,1)
    # Generate Plot
    if(plotType == "log-lin"):
        ylabel = "ln(cases + 1)"
        print(yV)
        yV = yV + 1
        yV = np.log(yV)
        # Fit everything unless a non-zero slice index was given
        xfit = xV
        yfit = yV
        if(slcIdx is not None):
            if(slcIdx < 0):
                xfit = xV[slcIdx:]
                yfit = yV[slcIdx:]
            elif(slcIdx > 0):
                xfit = xV[:slcIdx]
                yfit = yV[:slcIdx]
        fit = np.polyfit(xfit,yfit,deg=1)
        # Reuse xfit, and yfit : evaluate the fitted line at 100 evenly
        # spaced points over all n days
        xfit= np.asarray([x for x in np.arange(0,n,n/100.0)])
        yfit= fit[0]*xfit + fit[1]
        ax.plot(xfit, yfit, label="Fit - y={:.3f}x+{:.3f}".format(fit[0],fit[1]))
        ax.set_title("Covid-19 in {} (ending {})".format(country, lastDate))
    elif(plotType == "lin-lin"):
        ylabel = "Covid-19_Cases"
        exit_with_error("ERROR!! I haven't handled this option yet\n")

    else:
        exit_with_error("ERROR!! Invalid plotType option\n")
    ax.plot(xV, yV, label=ylabel)
    ax.set_xlabel("Time spanning 0-{} days".format(n-1))
    ax.set_ylabel("{}".format(ylabel))
    ax.legend()
    plt.show()

    print("Ended : %s"%(time.strftime("%D:%H:%M:%S")))
    print("Run Time : {:.4f} h".format((time.time() - startTime)/3600.0))



    sys.exit(0)
def main():
    """
    ARGS:
        None. Only sys.argv is inspected: a first argument containing "-h"
        calls print_help(0), any other extra argument calls print_help(1).
    RETURN:
        Does not return normally; the last statement is sys.exit(0).
    DESCRIPTION:
        Steps three coupled state values (xD, xS, xI) forward over
        range(0, nDays, dt), printing the state before every step. Each step
        builds four stage estimates with k1(), k2(), k3() and k4() for the
        functions f_D, f_S and f_I, then adds
        1/6 * (k1 + 2*k2 + 2*k3 + k4) to the matching state value.
        All model parameters are hard-coded below.
        NOTE(review): k1..k4 and f_D/f_S/f_I are defined elsewhere; the
        weights look like a classical 4th-order Runge-Kutta step - confirm.
    DEBUG:
    FUTURE:
        1. Add option to fit only a specific section of data.
    """
    nArg = len(sys.argv)
    # Use python 3
    if sys.version_info[0] != 3:
        exit_with_error("ERROR!!! Use Python 3\n")
    # Get options
    if nArg > 1 and "-h" in sys.argv[1]:
        print_help(0)
    elif nArg != 1:
        print_help(1)

    startTime = time.time()
    print("{} \n".format(sys.argv), flush=True)
    startStamp = time.strftime("%a, %d %b %Y %H:%M:%S ", time.localtime())
    print("   Start Time : {}".format(startStamp), flush=True)

    ## Model parameters (hard-coded; nothing is read from the command line)
    kappa = 1      # =1 is Poisson, >1 is negative binomial. Average contact network density
    beta = 0.1     # constant rate of infection
    gamma = 0.1    # average recovery rate
    mu = 3         # average number of nodes contacts
    delta = 0.1    # quarantine
    rho = 1
    R0 = kappa * beta / gamma   # computed but not used further in this function
    nDays = 100
    N = 10**3      # Number of agents (not used further in this function)

    # Initial Conditions
    xS = 1
    xI = rho
    xD = mu * rho
    dt = 1
    # One dict per stage, keyed by state name; reused on every step
    k1D = dict()
    k2D = dict()
    k3D = dict()
    k4D = dict()

    def weighted_increment(Key):
        """Combine the four stage values stored under Key with 1-2-2-1 weights."""
        return 1/6*(k1D[Key] + 2*k2D[Key] + 2*k3D[Key] + k4D[Key])

    print("{:<6} : {:<10} {:<10} {:<10}".format("Time", "xD", "xS", "xI"))

    # (state key, function) pairs, evaluated in this order for each stage
    functL = (("xD", f_D), ("xS", f_S), ("xI", f_I))

    for day in range(0, nDays, dt):
        print("{:<6} : {:<10.4e} {:<10.4e} {:<10.4e}".format(day, xD, xS, xI))

        # Keyword arguments common to every stage call within this step;
        # rebuilt each step so the stages see the current state values.
        sharedD = dict(H=dt, Beta=beta, Delta=delta, Kappa=kappa, Gamma=gamma,
                       Mu=mu, X_D=xD, X_S=xS, X_I=xI)

        ## k1
        for key, funct in functL:
            k1D[key] = k1(Funct=funct, **sharedD)
        ## k2 - receives the complete k1 results
        for key, funct in functL:
            k2D[key] = k2(K1D=k1D, Funct=funct, **sharedD)
        ## k3 - receives the complete k2 results
        for key, funct in functL:
            k3D[key] = k3(K2D=k2D, Funct=funct, **sharedD)
        ## k4 - receives the complete k3 results
        for key, funct in functL:
            k4D[key] = k4(K3D=k3D, Funct=funct, **sharedD)

        # Get next step value
        xD = xD + weighted_increment('xD')
        xS = xS + weighted_increment('xS')
        xI = xI + weighted_increment('xI')

    print("Ended : %s"%(time.strftime("%D:%H:%M:%S")))
    elapsedHours = (time.time() - startTime)/3600.0
    print("Run Time : {:.4f} h".format(elapsedHours))

    sys.exit(0)
Exemplo n.º 30
0
def check_correct_part(eq):
    """Validate the token sequence of an equation.

    Every token except the last is checked for its own shape and against the
    token that follows it; then the first and last tokens are checked.
    Violations are reported through ``exit_with_error`` (defined elsewhere):
    code ``-5`` for every check except the final one on the last token,
    which uses ``-4``.

    Accepted token shapes, as enforced below:
      * ``'+'`` / ``'-'``: next token must start with a digit or ``'X'``.
      * ``'*'``: next token must start with ``'X'``.
      * number: starts with a digit, holds only digits and at most one ``'.'``.
      * variable: ``'X'`` alone, or ``'X^'`` followed only by digits; the next
        token must be ``'+'``, ``'-'`` or ``'='``.
      * ``'='``: next token must start with a digit or ``'X'``, or be ``'-'``.

    Args:
        eq: non-empty sequence of non-empty string tokens. An empty sequence
            raises ``IndexError`` at the first-token check.

    Returns:
        None.

    NOTE(review): the last token is only checked on its final character, so
    e.g. a trailing ``'1.2.3'`` is not rejected here - confirm this is caught
    elsewhere.
    """
    # Walk adjacent (token, following) pairs; the last token has no successor
    # and is handled after the loop.
    for token, following in zip(eq, eq[1:]):
        if token in ('-', '+'):
            if not (following[0].isdigit() or following[0] == 'X'):
                exit_with_error(-5)

        elif token == '*':
            if following[0] != 'X':
                exit_with_error(-5)

        elif token[0].isdigit():
            if token.count('.') > 1:
                exit_with_error(-5)
            for char in token:
                if not (char.isdigit() or char == '.'):
                    exit_with_error(-5)

        elif token[0] == 'X':
            if token.count('X') > 1 or token.count('^') > 1:
                exit_with_error(-5)

            if len(token) > 1:
                if token[1] != '^':
                    exit_with_error(-5)
                else:
                    # Plain str.replace instead of re.sub('\^', ...): the
                    # non-raw '\^' literal is an invalid escape sequence.
                    exponent = token.replace('X', '').replace('^', '')
                    for char in exponent:
                        if not char.isdigit():
                            exit_with_error(-5)
            if following not in ('+', '-', '='):
                exit_with_error(-5)

        elif token == '=':
            if not (following[0].isdigit() or following[0] == 'X'
                    or following == '-'):
                exit_with_error(-5)
        else:
            exit_with_error(-5)

    # The equation must open with a number, 'X', or a leading minus.
    if not (eq[0][0].isdigit() or eq[0][0] == 'X' or eq[0][0] == '-'):
        exit_with_error(-5)

    # The equation must close on a digit or 'X'.
    if not (eq[-1][-1].isdigit() or eq[-1][-1] == 'X'):
        # Diagnostic output kept from the original before reporting the error.
        print(eq[0][0])
        print(eq)
        exit_with_error(-4)