Example #1
def makeDividedSets():
    import os
    from math import ceil
    from utils import readLines, writeLines
    from verifier import splitFullCategory

    ctgInPart = 35  # number of categories per division

    categories = readLines(Path.categories)
    divisions = ceil(len(categories) / ctgInPart)
    categories = [categories[i * ctgInPart:(i + 1) * ctgInPart] for i in range(divisions)]

    for i, ctgList in enumerate(categories):

        pathsList = []
        for ctg in ctgList:
            category, subcategory = splitFullCategory(ctg)

            originalPath = os.path.join(Path.dataset, Constants.original, category, subcategory)
            augmentedPath = os.path.join(Path.dataset, Constants.augmented, category, subcategory)

            pathsList.extend([originalPath, augmentedPath])

        setPath = os.path.join(Path.sets, f"part_{i}")
        makeSets(pathsList, wpath=setPath, trainPart=0.9, validPart=0.05)

        writeLines(ctgList, os.path.join(setPath, "set_categories.txt"))
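The chunking in makeDividedSets is plain list slicing. A minimal, self-contained sketch of the same idea, with hypothetical names and no project imports:

from math import ceil

def divide(items, size):
    # Split items into consecutive chunks of at most `size` elements
    parts = ceil(len(items) / size)
    return [items[i * size:(i + 1) * size] for i in range(parts)]

# divide(list(range(7)), 3) -> [[0, 1, 2], [3, 4, 5], [6]]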
Example #2
def processVideoFolder(folderPath=Path.rawVideos,
                       marksPath=Path.rawJson,
                       datasetPath=Path.dataset,
                       overwrite=False,
                       extension=Extensions.jpg,
                       params=None):

    processedVideos = readLines(Path.processedFiles)
    videos = [
        video for video in os.listdir(folderPath)
        if video not in processedVideos and (
            video.endswith(Extensions.mov) or video.endswith(Extensions.mp4))
    ]

    actualInfo = downloadActualInfo()

    for video in videos:
        actualizeInfoWithFrames(datasetPath)
        filePath = os.path.join(folderPath, video)

        print(
            f"\n{Fore.GREEN}Video {filePath} is being processed {Style.RESET_ALL}"
        )
        frameVideo(filePath=filePath,
                   marksPath=marksPath,
                   datasetPath=datasetPath,
                   actualInfo=actualInfo,
                   overwrite=overwrite,
                   extension=extension,
                   params=params)

        processedVideos.append(video)

        # Rewriting the list after every video checkpoints progress;
        # set() drops accidental duplicates before writing
        writeLines(set(processedVideos), Path.processedFiles)
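The video filter is a plain list comprehension; a standalone sketch of that step with hypothetical names (note that str.endswith accepts a tuple, which avoids the chained or-checks):

import os

def unprocessedVideos(folder, processed, suffixes=(".mov", ".mp4")):
    # Keep files with a matching extension that have not been processed yet
    processed = set(processed)
    return [v for v in os.listdir(folder)
            if v not in processed and v.endswith(suffixes)]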
Example #3
def purifySets():
    sets = {
        const.train:
        os.path.join(Path.sets, extendName(const.train, Extensions.txt)),
        const.valid:
        os.path.join(Path.sets, extendName(const.valid, Extensions.txt)),
        const.test:
        os.path.join(Path.sets, extendName(const.test, Extensions.txt)),
    }

    for set_, path in sets.items():
        files = readLines(path)
        total = len(files)
        files = [f for f in files if os.path.exists(f)]
        writeLines(files, path)

        print(f"Cleaned {total - len(files)} from {path}")
Example #4
def makeCategoriesList(summarizedPath=Path.summarizedRaw,
                       allowedSubCtgList=None):
    from utils import openJsonSafely, writeLines
    from verifier import getFullCategory

    summarized = openJsonSafely(summarizedPath)

    ctgList = []
    for ctg, value in summarized.items():
        if ctg == const.maxIdx:
            continue

        for subctg in value:
            if allowedSubCtgList is not None and subctg not in allowedSubCtgList:
                continue

            idx = value[subctg][const.ctgIdx]
            ctgList.append((getFullCategory(ctg, subctg), idx))

    ctgList = [ctg for ctg, _ in sorted(ctgList, key=lambda x: x[1])]
    writeLines(ctgList, Path.categories)
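The final two lines are a decorate/sort/strip pattern: sort the pairs by the stored index, then keep only the names. A tiny sketch with made-up data:

pairs = [("dog_husky", 2), ("cat_siamese", 0), ("dog_corgi", 1)]
ordered = [name for name, _ in sorted(pairs, key=lambda p: p[1])]
# ordered == ["cat_siamese", "dog_corgi", "dog_husky"]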
Example #5
def makeSets(directories,
             wpath=Path.sets,
             trainPart=0.9,
             validPart=0.05,
             ignoreOld=False,
             matchWithMarks=True):
    assert 0 < trainPart + validPart <= 1
    os.makedirs(wpath, exist_ok=True)

    testPart = 1 - trainPart - validPart

    sets = {
        const.train: {
            "path": os.path.join(wpath, extendName(const.train,
                                                   Extensions.txt)),
            "part": trainPart,
            "content": []
        },
        const.valid: {
            "path": os.path.join(wpath, extendName(const.valid,
                                                   Extensions.txt)),
            "part": validPart,
            "content": []
        },
        const.test: {
            "path": os.path.join(wpath, extendName(const.test,
                                                   Extensions.txt)),
            "part": testPart,
            "content": []
        }
    }

    inUse = []
    for set_, info in sets.items():
        info["content"] = readLines(info["path"]) if not ignoreOld else []
        inUse.extend(info["content"])

    images = []
    marks = []
    for dirIdx, path in enumerate(directories):
        print(
            "\rSearching for images and marks in listed directories, {:.1f}% has been done"
            .format(dirIdx / len(directories) * 100),
            end="")

        dirImages = [
            os.path.join(path, *img) for img in walk(
                path, targetExtensions=Extensions.images()).get("extensions")
        ]
        images.extend(dirImages)

        if matchWithMarks:
            dirMarks = [
                os.path.join(path, *mrk) for mrk in walk(
                    path, targetExtensions=Extensions.txt).get("extensions")
            ]
            marks.extend(dirMarks)

    if matchWithMarks:
        transformer = lambda x: changeExtension(x, Extensions.txt)
        print("Matching images to marks, please wait...")
        images = matchLists(master=marks,
                            slave=images,
                            transformer=transformer)

    # _, images = matchLists(master=inUse, slave=images, getMismatched=True)

    images = permutate(images)

    start = 0
    for set_, info in sets.items():
        part = info["part"]
        end = start + int(part * len(images))

        total = end - start

        info["content"].extend(images[start:end])
        info["content"] = permutate(info["content"])
        start = end

        writeLines(lines=info["content"], path=info["path"])
        print(f"\n{Fore.GREEN}Added {total} paths to {set_} {Style.RESET_ALL}")
Example #6
def frameVideo(filePath,
               marksPath,
               datasetPath,
               actualInfo,
               overwrite=False,
               extension=Extensions.jpg,
               params=None,
               ctgLimit=None):

    categories = readLines(Path.categories)
    basename = extractBasename(filePath)

    try:
        jsonName = makeJSONname(basename)
        # Use a context manager so the file handle is closed promptly
        with open(os.path.join(marksPath, jsonName), "r") as jsonFile:
            marks = json.load(jsonFile)
    except (OSError, json.JSONDecodeError):
        print(
            f"{Fore.RED}There is no json file {jsonName} in {marksPath} for {filePath} {Style.RESET_ALL}"
        )
        return

    framesGenerator = generateFrames(filePath)
    offset = getKeysOffset(marks.keys())
    marksSeparated = {}
    total = 0
    globalIdx = None  # resolved on the first frame that has marks
    for idx, frame in enumerate(framesGenerator):
        # if idx == 20:
        #     break

        frameMarks = getFrameMarks(idx, marks, offset)
        if not frameMarks:
            continue

        category = frameMarks[const.category]
        subcategory = frameMarks[const.subcategory]

        countKeys = [const.original, category, subcategory]
        if globalIdx is None:
            # Resolve the numbering offset once, on the first frame with marks
            globalIdx = getNested(dictionary=actualInfo,
                                  keys=countKeys,
                                  default=0)

        localIdx = idx + globalIdx
        if ctgLimit is not None and localIdx == ctgLimit:
            break

        frameID = f"frame_{localIdx}"
        fullCategory = getFullCategory(category, subcategory)

        if fullCategory not in categories:
            categories.append(fullCategory)

        ctgIdx = categories.index(fullCategory)
        frameName = f"{fullCategory}{const.separator}{frameID}{const.separator}{const.original}"

        dirPath = os.path.join(datasetPath, const.original, category,
                               subcategory)
        framesPath = os.path.join(dirPath, const.frames)
        framePath = os.path.join(framesPath, extendName(frameName, extension))

        updateNested(dictionary=actualInfo, keys=countKeys, value=1)
        if not overwrite and os.path.exists(framePath):
            print("\rFrame #{} has been passed".format(idx), end="")
            continue

        os.makedirs(framesPath, exist_ok=True)

        frameInfo = {
            const.image: extendName(frameName, extension),
            const.coords: fitCoords(frameMarks[const.coords], frame.shape[:2]),
            const.fullCategory: fullCategory,
            const.ctgIdx: ctgIdx,
            const.imageShape: frame.shape[:2]
        }

        keySet = countKeys + [
            frameName
        ]  # ["original", category, subcategory, frameName]
        putNested(dictionary=marksSeparated, keys=keySet, value=frameInfo)

        cv2.imwrite(framePath, frame, params)
        total += 1

        print("\rFrame #{} has been added".format(idx), end="")

    marksSeparated = marksSeparated.get(const.original, {})
    print()
    for ctg, value in marksSeparated.items():
        for subctg, subctgMarks in value.items():
            subctgMarksJson = os.path.join(
                datasetPath, const.original, ctg, subctg,
                extendName(const.marks, Extensions.json))

            oldMarks = openJsonSafely(subctgMarksJson)
            for k, v in subctgMarks.items():
                oldMarks[k] = v

            with open(subctgMarksJson, "w") as jsonFile:
                json.dump(oldMarks, jsonFile, indent=3)

            print(
                f"{Fore.GREEN}Added marks to {subctgMarksJson} {Style.RESET_ALL}"
            )

    writeLines(categories, Path.categories)
    print(
        f"{Fore.GREEN}Updated categories file {Path.categories} {Style.RESET_ALL}"
    )
    print(f"{Fore.GREEN}Added {total} frames in total {Style.RESET_ALL}")
Example #8
cms = {}
for db in cmdbs:
    try:
        cms[db] = cm2(db)
    except Exception:
        print('OOPS!')
        continue
# cms = { db:cm2(db) for db in cmdbs }
    
for iii in range(len(clusters)):
    ##df = clust_dfs.ix[iii]  # dont need for this?
    if len(clusters[iii]) < 10:
        break  # clusters appear to be ordered by size, so stop at the first small one
    out = get_motif_cluster_sites( iii, force=False )
    if out['record'] is None:
        continue
    try:
        pdf_data = plot_motif_from_sites( out['sites'], 'pdf', out['smallText'] )
        ut.writeLines( pdf_data.split( '\n' ), 'motif_clusters_%s/%04d.pdf'%(param_I_str,iii) )
        ut.writeLines( out['memeOut'].split( '\n' ), 'motif_clusters_%s/%04d_memeOut.txt'%(param_I_str,iii) )
    except Exception:
        print('ERROR in generating PDF for motif cluster.')
    print(iii, 'DONE')
        
#os.popen( 'pdftk motif_clusters_%s/????.pdf cat output motif_clusters_%s/ALL.pdf'%(param_I_str,param_I_str) ).read()
try:
    os.popen( ('gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile=motif_clusters_%s/ALL.pdf '
               'motif_clusters_%s/????.pdf') % (param_I_str,param_I_str) ).read()
    os.popen( 'pdfnup --landscape --suffix nup --nup 5x6 motif_clusters_%s/ALL.pdf'%(param_I_str) ).read()
except Exception:
    print('gs or pdfnup not available!')

#     widths = np.array( [ pssm.shape[0] for pssm in all_pssms.values() ] )
#     max_width = np.max( widths )
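os.popen silently swallows a missing gs binary. A sketch of the same merge via subprocess, reusing the gs flags from the snippet above (a hypothetical helper, not this project's code):

import glob
import subprocess

def mergePdfs(pattern, outPath):
    # Concatenate PDFs with Ghostscript; subprocess.run(check=True) raises
    # CalledProcessError if gs fails and FileNotFoundError if it is missing
    files = sorted(glob.glob(pattern))
    cmd = ["gs", "-dBATCH", "-dNOPAUSE", "-q",
           "-sDEVICE=pdfwrite", f"-sOutputFile={outPath}"] + files
    subprocess.run(cmd, check=True)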