Exemplo n.º 1
0
def helper(hashTable, reads, infoString):
    """Print the summed hash-table values of level-string k-mers taken from reads.

    Reads are processed in order until ``kmerNum`` k-mers have been examined.
    A read is used only if it passes the alignment filter selected by
    ``infoString`` and its signal is longer than ``toRead`` samples.

    Relies on module-level names defined elsewhere in the file: ``kmerNum``,
    ``k``, ``levels``, ``fromRead``, ``toRead``, ``referenceIdx``,
    ``getSeqfromRead``, ``getSignalFromRead`` and ``getLevelStr``.

    Args:
        hashTable: Mapping from k-mer to a numeric value; missing k-mers
            contribute 0.
        reads: Iterable of read files.
        infoString: ``"+"`` keeps only reads with exactly one qualifying
            alignment, ``"-"`` keeps only reads with none; any other value
            disables the alignment filter. It is also echoed in the printed
            summary line.

    Returns:
        None. The result is printed as ``<total> / <number of k-mers>``.
    """
    total, totalNum = 0, 0

    for readFile in reads:
        if totalNum == kmerNum:
            break
        try:
            readFastq, _ = getSeqfromRead(readFile)
        except Exception:
            # Best effort: skip reads whose sequence cannot be extracted, but
            # let KeyboardInterrupt / SystemExit propagate.
            continue

        # Alignments whose query span covers more than 95% of the read and
        # whose strand is 1 (forward strand in mappy's convention, assuming
        # referenceIdx is a mappy Aligner).
        hits = [
            aln for aln in referenceIdx.map(readFastq)
            if aln.q_en - aln.q_st > 0.95 * len(readFastq) and aln.strand == 1
        ]

        if infoString == "+" and len(hits) != 1:
            continue
        if infoString == "-" and hits:
            continue

        readSignal = getSignalFromRead(readFile)

        # The signal must extend past the [fromRead:toRead] window.
        if len(readSignal) <= toRead:
            continue

        readString = getLevelStr(readSignal[fromRead:toRead], levels)

        # Only full-length windows are k-mers; the trailing k - 1 start
        # positions would give truncated slices that can never match.
        for i in range(len(readString) - k + 1):
            if kmerNum == totalNum:
                break
            kmer = readString[i:i + k]
            totalNum += 1
            total += hashTable.get(kmer, 0)
    print(f"{infoString} k {k} l {levels} -> {total} / {totalNum}")
Exemplo n.º 2
0
    readString = getLevelString(readSignal, smoothParam, levels, overflow)
    readDict = buildDictionary(readString, kmerLength)
    return overlap(readDict, hashTable)


########################################

data = []

# posTestCases acts as a countdown: it is decremented for every accepted read
# and the loop stops once it reaches zero.
for filePath in posFast5:
    if posTestCases == 0:
        break

    try:
        readSeq, basecallTable = getSeqfromRead(filePath)
    except Exception:
        # Best effort: skip reads whose sequence cannot be extracted, but do
        # not swallow KeyboardInterrupt / SystemExit.
        continue
    # Skip reads whose sequence is shorter than toRead // repeatSignal.
    if len(readSeq) < (toRead // repeatSignal):
        continue
    # Keep alignments covering more than 95% of the read, with strand == 1
    # (forward strand in mappy's convention, assuming referenceIdx is a mappy
    # Aligner), that land on the contig under study.
    hits = [
        aln for aln in referenceIdx.map(readSeq)
        if aln.q_en - aln.q_st > 0.95 * len(readSeq)
        and aln.strand == 1
        and aln.ctg == workingContig
    ]
    if not hits:
        continue
    # The first qualifying alignment is used.
    hit = hits[0]

    posTestCases -= 1
    lvlStringHits = processRead(filePath, workingContig)
Exemplo n.º 3
0
### get corresponding part of the reference using minimap2
referenceIdx = mp.Aligner(refFile)
# Raise explicitly instead of asserting: asserts are stripped under
# `python -O`, which would let a failed index load go unnoticed here.
if not referenceIdx:
    raise RuntimeError("failed to load/build reference index")

# K-mer model and the lists of read files for both classes.
# NOTE(review): minSize is presumably a minimum file size -- confirm against
# getReadsInFolder.
mod = KmerModel.load_from_hdf5(kmerModelFilePath)
posReads = getReadsInFolder(readsPosFilePath, minSize=1000000)
negReads = getReadsInFolder(readsNegFilePath, minSize=1000000)

# Counters start at zero; they are not updated in the visible part of the
# script.
goodK, totalK = 0, 0

# "dobre" / "zle" are Slovak for "good" / "bad".
dobre, zle = 0, 0

# Visit at most maxTests positive reads (slicing already clamps to the list
# length) and keep only those with exactly one near-full-length alignment.
for readFile in posReads[:maxTests]:
    try:
        readFastq, readEvents = getSeqfromRead(readFile)
    except Exception:
        # Best effort: skip reads whose sequence cannot be extracted, but do
        # not swallow KeyboardInterrupt / SystemExit.
        continue
    # NOTE(review): presumably the sequence corresponding to the signal window
    # [signalFrom, signalTo) -- confirm against seqSignalCor.
    readSeq = seqSignalCor(signalFrom, signalTo, readEvents)

    # Alignments whose query span covers more than 95% of readSeq.
    hits = [
        aln for aln in referenceIdx.map(readSeq)
        if aln.q_en - aln.q_st > 0.95 * len(readSeq)
    ]
    if len(hits) != 1:
        # Too many or too few hits: skip the read.
        continue
    print(readFile)
    hit = hits[0]
    successfulReads += 1
Exemplo n.º 4
0
        levelStr = line[3]
        storeContig[line[0]] = levelStr

# hashTable accumulates entries from every contig; hashTables keeps a separate
# dictionary per contig name.
hashTable = {}
hashTables = {}

for contigName, contigLevels in storeContig.items():
    hashTables[contigName] = {}
    # buildDictionarySpecial fills the dictionary passed as its first argument.
    buildDictionarySpecial(hashTables[contigName], contigLevels, kmerLen)
    buildDictionarySpecial(hashTable, contigLevels, kmerLen)

print("Preparation done!")

# Only the first 400 positive read paths are examined.
for sample in posReadsPaths[:400]:
    try:
        readFastq, readEvents = getSeqfromRead(sample)
    except Exception:
        # Best effort: skip reads whose sequence cannot be extracted, but do
        # not swallow KeyboardInterrupt / SystemExit.
        continue

    # Keep alignments covering more than 95% of the read, with strand == 1
    # (forward strand in mappy's convention, assuming referenceIdx is a mappy
    # Aligner), that land on the contig under study.
    hits = [
        aln
        for aln in referenceIdx.map(readFastq)
        if aln.q_en - aln.q_st > 0.95 * len(readFastq)
        and aln.strand == 1
        and aln.ctg == workingContig
    ]
    # Require exactly one qualifying alignment.
    if len(hits) != 1:
        continue
    hit = hits[0]

    # Alignment start on the reference as a fraction of the contig length.
    refPosition = hit.r_st / len(ref[hit.ctg])
Exemplo n.º 5
0
# Raise explicitly instead of asserting: asserts are stripped under
# `python -O`, which would let a failed index load go unnoticed here.
if not referenceIdx:
    raise RuntimeError("failed to load/build reference index")

# Lists of read files for both classes; minSize=0 is passed through to
# getReadsInFolder (presumably meaning "no size filter" -- confirm).
posReads = getReadsInFolder(readsPosFilePath, minSize=0)
negReads = getReadsInFolder(readsNegFilePath, minSize=0)

################################################################################

# negHitsByRatios[i] counts the negative reads whose longest alignment spans at
# least ratios[i] of the read length.
negHitsByRatios = [0] * len(ratios)
totalCount = 0

for readFile in negReads:
    # Stop once readCount reads have been evaluated.
    if totalCount == readCount:
        break

    try:
        readFastq, _ = getSeqfromRead(readFile)
    except Exception:
        # Best effort: skip reads whose sequence cannot be extracted, but do
        # not swallow KeyboardInterrupt / SystemExit.
        continue

    # Query-span length of every alignment reported for this read.
    hits = [aln.q_en - aln.q_st for aln in referenceIdx.map(readFastq)]

    totalCount += 1

    # The longest span (0 when there is no alignment) is the same for every
    # threshold, so it is computed once per read.
    hit = max(hits + [0])
    # There is no early exit: every threshold the read satisfies is counted.
    for i in reversed(range(len(ratios))):
        if hit >= ratios[i] * len(readFastq):
            negHitsByRatios[i] += 1

posHitsByRatios = [0] * len(ratios)
Exemplo n.º 6
0
################################################################################

# Accumulators indexed by position in `levels`; the nested ones are further
# indexed by position in `kmerLen`. Each cell starts as its own empty list.
# ("pomery" is Slovak for "ratios".)
pomery = [
    [[] for _ in kmerLen]
    for _ in levels
]
overlap = [
    [[] for _ in kmerLen]
    for _ in levels
]

# One independent empty list per entry of `levels`.
goodDash = [[] for _ in levels]
badDash = [[] for _ in levels]

# One zero-initialised counter per entry of `levels`.
alignLenRead = [0 for _ in levels]
alignLenFake = [0 for _ in levels]

# Progress counter compared against readNum by the loop that follows.
readCounter = 0

# Process positive reads until readCounter reaches readNum (readCounter is not
# incremented in the visible part of this loop).
for posRead in posReads:
    if readCounter == readNum:
        break
    try:
        readFastq, readEvents = getSeqfromRead(posRead)
    except Exception:
        # Best effort: skip reads whose sequence cannot be extracted, but do
        # not swallow KeyboardInterrupt / SystemExit.
        continue
    # NOTE(review): presumably the sequence corresponding to the signal window
    # [signalFrom, signalTo) -- confirm against seqSignalCor.
    readSeq = seqSignalCor(signalFrom, signalTo, readEvents)

    # Alignments whose query span covers more than 95% of readSeq.
    hits = [
        aln for aln in referenceIdx.map(readSeq)
        if aln.q_en - aln.q_st > 0.95 * len(readSeq)
    ]
    if len(hits) != 1:
        # Too many or too few hits: skip the read.
        continue
    hit = hits[0]

    print("Working on", posRead)
    print(f"So far done {readCounter} reads")
Exemplo n.º 7
0
import numpy as np

sys.path.append("../")
from signalHelper import getReadsInFolder, getSignalFromRead, getSeqfromRead

reads = getReadsInFolder(readsFilePath, minSize=0)

# Raw-signal length and sequence length of every successfully parsed read.
signalLengths, seqLengths = [], []

# maxReads is a countdown of reads still wanted; it is decremented for every
# read that yields both a signal and a sequence.
for read in reads:
    if maxReads == 0:
        break

    try:
        signal = getSignalFromRead(read)
        seq, _ = getSeqfromRead(read)
    except Exception:
        # Best effort: skip reads that cannot be parsed, but do not swallow
        # KeyboardInterrupt / SystemExit.
        continue

    maxReads -= 1
    signalLengths.append(len(signal))
    seqLengths.append(len(seq))


meanSignal = np.mean(signalLengths)
medianSignal = np.median(signalLengths)

meanSeq = np.mean(seqLengths)
medianSeq = np.median(seqLengths)

# Whatever is left of the countdown is the number of reads that were wanted
# but not obtained.
print(f"Not found {maxReads} reads!")