Example #1
def main():
    # check that everything has been installed correctly
    root = os.path.dirname(os.path.abspath(__file__))
    root += '/'

    #Get the parameters
    args, parser = get_params()
    #set the required variables
    debug = args.debug
    # output dir
    outDir = '{:s}/mmseqs2_src/'.format(os.path.realpath(
        args.output_directory))

    # Make some checks before extracting the package
    # check that GCC is installed
    gccVer, gccOk = check_gcc()
    if not gccOk:
        sys.stderr.write(
            '\nERROR: no GCC compiler was found in your system.\n')
        sys.stderr.write('Please, go to https://gcc.gnu.org\n')
        sys.stderr.write(
            'And follow the instructions to install the latest version of GCC in your system.\n'
        )
        sys.exit(-5)
    # check that cmake is installed
    cmake3Ver, cmake3Ok = check_cmake3()
    cmakeVer, cmakeOk = check_cmake()

    # if neither version is installed then exit with an error
    if not (cmake3Ok or cmakeOk):
        print(
            'ERROR: you must install cmake version 3.10 or above before continuing'
        )
        sys.stderr.write('Please, go to https://cmake.org\n')
        sys.stderr.write(
            'And follow the instructions on how to install the latest version of cmake in your system.\n'
        )
        sys.exit(-5)

    # path to the source package
    tarPath = os.path.join(root, 'mmseqs2_src/mmseqs.tar.gz')

    # skip the extraction if the directory already exists
    if os.path.isdir(outDir):
        print('WARNING: the directory\n{:s}'.format(outDir))
        print('already exists, if you want to extract the package,')
        print('please remove the above-mentioned directory.')
        print('\nEXIT: no file was extracted.')
        sys.exit(-2)

    # create the directory if it does not exist
    systools.makedir(outDir)

    #if debug:
    #    print('The source for MMseqs2 will be stored in\n{:s}'.format(outDir))
    systools.untar(tarPath, outDir=outDir, debug=debug)
    # create the build Directory
    systools.makedir(os.path.join(outDir, 'build/'))
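
The helper functions check_gcc(), check_cmake3(), and check_cmake() are not shown in these examples. Judging only from how their (version, ok) return values are used above, a minimal sketch of the GCC check could look as follows; check_gcc_sketch is a hypothetical name, not the actual implementation.

import shutil
import subprocess

def check_gcc_sketch():
    """Return (version string or None, True if gcc is on the PATH)."""
    if shutil.which('gcc') is None:
        return (None, False)
    # 'gcc --version' prints the version on the first line of stdout
    out = subprocess.run(['gcc', '--version'], capture_output=True, text=True)
    return (out.stdout.splitlines()[0], True)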
Example #2
def mmseqs_createindex(dbPath, debug=False):
    """Create a index from a mmseq2 database file."""
    if debug:
        print('mmseqs_createindex :: START')
        print('Input mmseqs2 db file:\t%s' % dbPath)
    # check that the database file exists
    if not os.path.isfile(dbPath):
        sys.stderr.write(
            'The file %s was not found, please provide the path to a mmseqs2 database file'
            % dbPath)
        sys.exit(-2)
    #''' USE IN FUTURE VERSION OF CREATEINDEX
    #print(dbPath)
    tmpBname = os.path.basename(dbPath)
    tmpDir = '{:s}/tmp_{:s}/'.format(
        os.path.dirname(dbPath), os.path.basename(tmpBname.split('.', 1)[0]))
    systools.makedir(tmpDir)
    makeIdxCmd = '{:s} createindex {:s} {:s} --threads 2 -v 0'.format(
        get_mmseqs_path(), dbPath, tmpDir)
    #'''
    # command to be executed
    # EXAMPLE; mmseqs createindex in.mmseqs2_db
    #makeIdxCmd = '{:s} createindex {:s} -v 0'.format(get_mmseqs_path(), dbPath)
    if debug:
        print('mmseqs2 createindex CMD:\t%s' % makeIdxCmd)
    #execute the system call
    process = subprocess.Popen(makeIdxCmd,
                               shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    stdout_val, stderr_val = process.communicate()  #get stdout and stderr
    process.wait()
    if debug:
        print('STDOUT:\n%s\n' % stdout_val)
        print('STDERR:\n%s\n' % stderr_val)
    # make sure that the index files have been properly created
    idx1 = '%s.sk6' % dbPath
    if not os.path.isfile(idx1):
        sys.stderr.write('The MMseqs2 index file %s could not be created.' %
                         idx1)
        sys.exit(-2)
    idx2 = '%s.sk6.index' % dbPath
    if not os.path.isfile(idx2):
        sys.stderr.write(
            '\nWARNING: The MMseqs2 index file %s could not be created.' %
            idx2)
        #sys.exit(-2)
    # remove the temporary directory
    shutil.rmtree(path=tmpDir)
    # return an output tuple
    return (stdout_val, stderr_val, makeIdxCmd, idx1, idx2)
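
A hedged usage sketch: assuming a database already built with mmseqs_createdb (see Example #3), the index can be created as shown below; the database path is illustrative only.

# hypothetical usage of mmseqs_createindex on an existing database file
stdout_val, stderr_val, idxCmd, idx1, idx2 = mmseqs_createindex(
    '/tmp/mmseqs2_databases/proteome_A.mmseqs2db', debug=True)
print('Index command executed:', idxCmd)
print('Main index file:', idx1)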
Example #3
def mmseqs_createdb(inSeq, outDir=os.getcwd(), debug=False):
    """Create a database file for mmseqs2 from the input sequence file."""
    if debug:
        print('mmseqs_createdb :: START')
        print('Input FASTA file:\t%s' % inSeq)
        print('Outdir:\t%s' % outDir)
    # check that the input file exists
    if not os.path.isfile(inSeq):
        sys.stderr.write(
            'The file %s was not found, please provide the path to a valid FASTA file'
            % inSeq)
        sys.exit(-2)
    # make sure the output directory path ends with '/'
    if outDir[-1] != '/':
        outDir += '/'
    # create dir if not already exists
    systools.makedir(outDir)
    # set the database name
    dbName = os.path.basename(inSeq)
    dbName = dbName.split('.')[0]  # take the left part of the file name
    dbName = '%s.mmseqs2db' % dbName
    dbPath = '%s%s' % (outDir, dbName)
    # command to be executed
    # EXAMPLE; mmseqs createdb in.fasta /outdir/mydb
    makeDbCmd = '%s createdb %s %s -v 0' % (get_mmseqs_path(), inSeq, dbPath)
    if debug:
        print('mmseqs2 createdb CMD:\t%s' % makeDbCmd)
    #execute the system call
    process = subprocess.Popen(makeDbCmd,
                               shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    stdout_val, stderr_val = process.communicate()  #get stdout and stderr
    process.wait()
    if debug:
        print('STDOUT:\n%s\n' % stdout_val)
        print('STDERR:\n%s\n' % stderr_val)
    #return a tuple with the results
    return (stdout_val, stderr_val, makeDbCmd, dbPath)
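
A hedged usage sketch of mmseqs_createdb; the FASTA path and output directory are illustrative only.

# hypothetical usage: build an MMseqs2 database from a FASTA file
stdout_val, stderr_val, dbCmd, dbPath = mmseqs_createdb(
    'proteome_A.fasta', outDir='/tmp/mmseqs2_databases/', debug=True)
print('Database created at:', dbPath)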
Example #4
def main():

    # check that everything has been installed correctly
    root = os.path.dirname(os.path.abspath(__file__))
    root += '/'

    #Get the parameters
    args, parser = get_params()
    #set the required variables
    debug = args.debug
    # output dir
    outDir = '{:s}/sonicparanoid_test/'.format(
        os.path.realpath(args.output_directory))

    # path to the source package
    testSrcDir = os.path.join(root, 'example/')

    # skip the copy if the directory already exists
    if os.path.isdir(outDir):
        print('WARNING: the directory\n{:s}'.format(outDir))
        print('already exists; if you want to copy the test files,')
        print('please remove the above-mentioned directory.')
        print('\nEXIT: no file was copied.')
        sys.exit(-2)
    # create the directory if it does not exist
    systools.makedir(outDir)

    # copy the test files
    print(outDir)
    copytree(testSrcDir, outDir, symlinks=False, ignore=None)
    #copytree(testSrcDir, outDir)
    if os.path.isdir(outDir):
        print('INFO: all test files were successfully copied to\n{:s}\n'.format(
            outDir))
    # suggest the command to run
    print('Go inside the directory\n{:s}\nand type\n'.format(outDir))
    print('sonicparanoid -i ./test_input -o ./test_output -m fast -t 4')
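
Note that copytree is invoked after the destination directory has already been created by systools.makedir, so the copytree used here presumably tolerates an existing destination. With the standard library that behaviour is available from Python 3.8 onward; a rough equivalent with illustrative paths would be:

import shutil

# dirs_exist_ok=True lets shutil.copytree write into an existing directory (Python 3.8+)
shutil.copytree('example/', '/tmp/sonicparanoid_test/', symlinks=False,
                ignore=None, dirs_exist_ok=True)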
Example #5
def main():

    # check that everything has been installed correctly
    root = os.path.dirname(os.path.abspath(__file__))
    root += '/'

    # get gcc version
    ##gccVer, gccOk = check_gcc()
    # compile binaries for multi-species orthology if required
    '''
    check_quick_multiparanoid_installation(gccVer, gccOk)
    check_quick_multiparanoid_installation(gccVer, gccOk)
    '''
    #### FORCE IT TO OK
    check_quick_multiparanoid_installation(
        'FORCED TO TRUE: System specific GCC version', True)
    ####
    # check MMseqs2 installation
    # check_mmseqs_installation(root)

    # start measuring the execution time
    ex_start = time.perf_counter()
    #Get the parameters
    args, parser = get_params()
    #start setting the needed variables
    debug = args.debug
    inDir = None
    if args.input_directory is not None:
        inDir = '%s/' % os.path.realpath(args.input_directory)
    # output dir
    outDir = '%s/' % os.path.realpath(args.output_directory)

    # create input directory inside the output directory
    tmpInDir = os.path.join(outDir, 'input/')
    # if it already exists make sure all files in it are removed
    if os.path.isdir(tmpInDir):
        for el in os.listdir(tmpInDir):
            os.remove(os.path.join(tmpInDir, el))
    else:
        systools.makedir(tmpInDir)
    # copy input files to the temporary input directory
    inProtCnt = 0
    for f in os.listdir(inDir):
        if f.startswith('.'):
            continue
        tmpPath = os.path.join(inDir, f)
        inProtCnt += 1
        if os.path.isfile(tmpPath):
            systools.copy(tmpPath, tmpInDir, debug=debug)
    # INPUT CHECK
    inDir = tmpInDir
    # check integrity of input files
    inputModList = check_input_files(inDir, debug=debug)
    # rename the input files where required
    for tpl in inputModList:
        newPath, newSymbol = tpl
        print(newPath)
        oldPath = newPath.replace('_no_blanks', '')
        # remove the old file and rename the one with valid headers
        if os.path.isfile(oldPath):
            print(oldPath)
            os.remove(oldPath)
            systools.move(newPath, oldPath, debug=debug)
    if debug:
        print('Input files whose FASTA header was modified:\t%d' %
              len(inputModList))
    # Optional directories setup
    alignDir = None
    if args.shared_directory is not None:
        alignDir = '%s/' % os.path.realpath(args.shared_directory)
    else:
        alignDir = os.path.join(outDir, 'alignments/')
        systools.makedir(alignDir)
    dbDirectory = None
    if args.mmseqs_dbs is not None:
        dbDirectory = '%s/' % os.path.realpath(args.mmseqs_dbs)
    else:
        dbDirectory = '%smmseqs2_databases/' % outDir
    cpus = args.threads
    #coff = args.cutoff
    coff = 40
    owOrthoTbls = args.overwrite_tables
    multiOnly = args.multi_species_only
    skipMulti = args.skip_multi_species
    runMode = args.mode
    maxGenePerSp = args.max_gene_per_sp

    # set the sensitivity value for MMseqs2
    sensitivity = 4.0  # default
    if runMode == 'sensitive':
        sensitivity = 6.0
    elif runMode == 'fast':
        sensitivity = 2.5
    elif runMode == 'most-sensitive':
        sensitivity = 7.5
    overwrite = args.overwrite
    if overwrite:
        owOrthoTbls = True

    # set sensitivity using a user-specified value if needed
    if args.sensitivity:
        if 1. <= args.sensitivity <= 7.5:
            sensitivity = round(args.sensitivity, 1)
            print(
                'WARNING: the run mode \'%s\' will be overwritten by the custom MMseqs sensitivity value of %s.\n'
                % (runMode, str(args.sensitivity)))
        else:
            sys.stderr.write(
                '\nERROR: the sensitivity parameter must have a value between 1.0 and 7.5\n'
            )

    # set the maximum length difference allowed if different from the default
    if args.max_len_diff != 0.5:
        if not (0. <= args.max_len_diff <= 1.0):
            sys.stderr.write(
                '\nERROR: the length difference ratio must have a value between 0 and 1.\n'
            )

    # set the variable to control the creation of orthologous pairs
    output_relations = args.output_pairs
    if args.qfo_2011:
        output_relations = True

    updateId = None
    if args.update is not None:
        updateId = args.update
    #check that mutually exclusive options are not selected
    if multiOnly and skipMulti:
        sys.stderr.write(
            '\nERROR: you cannot select the options --skip-multi-species and --multi-species-only at the same time.\n'
        )
        sys.exit(-5)
    # set the variable for MMseqs2 database indexing
    idx_dbs = True
    if args.no_indexing:
        idx_dbs = False
    # name for multispecies groups
    multiSpeciesClstrNameAll = 'multispecies_clusters.tsv'
    print('\nSonicParanoid will be executed with the following parameters:')
    print('Input directory:\t{:s}'.format(inDir))
    print('Input proteomes:\t{:d}'.format(inProtCnt))
    print('Output directory:\t{:s}'.format(outDir))
    print('Alignments directory:\t{:s}'.format(alignDir))
    print('Create pre-filter indexes:\t{:s}'.format(str(idx_dbs)))
    print('Complete overwrite:\t{:s}'.format(str(overwrite)))
    print('Re-create ortholog tables:\t{:s}'.format(str(owOrthoTbls)))
    print('CPUs:\t{:d}'.format(cpus))
    print('Run mode:\t%s (MMseqs2 s=%s)' % (runMode, str(sensitivity)))

    # check that the input directory has been provided
    if (inDir is None):
        sys.stderr.write('\nERROR: no input species.\n')
        parser.print_help()

    # Check if the run already exists
    spFile = pairsFile = None
    # SUGGEST THE USER TO USE THE UPDATE FEATURE
    spFile = os.path.join(outDir, 'species.txt')
    # check if it is an update or not
    if os.path.isfile(spFile) and (not owOrthoTbls):
        # fill a set with the new species names
        newfSet = set()
        flist = os.listdir(inDir)
        for f in flist:
            if f.startswith('.DS_'):
                continue
            newfSet.add(f)
        # now create a set from old species list
        oldfSet = set()
        for ln in open(spFile):
            oldfSet.add(ln.rstrip('\n'))
        # make the union of the two sets
        unionSet = oldfSet.union(newfSet)
        # if the lengths differ, suggest the user to use
        # the update or overwrite option
        if len(unionSet) != len(oldfSet):
            if updateId is None:
                sys.stderr.write(
                    '\n\nThe file with species already exists, but the new species list is different from the existing one.'
                )
                sys.stderr.write(
                    '\nThis suggests that you have added, or removed species from the input directory.'
                )
                sys.stderr.write(
                    '\nPlease consider using the \'--update update_name\' option.'
                )
                sys.stderr.write(
                    '\nAlternatively you could completely overwrite a previous run using the \'--overwrite\' option.\n'
                )
                sys.exit(-6)
    else:
        spFile = None

    # start the processing
    update_run = False

    if updateId is None:
        if multiOnly:  # skip everything and directly execute multi-species clustering
            spFile = '%sspecies.txt' % outDir
            pairsFile = '%sspecies_pairs.txt' % outDir
        else:
            spFile, pairsFile = orthodetect.run_sonicparanoid2_multiproc(
                inDir,
                outDir=outDir,
                threads=cpus,
                sharedDir=alignDir,
                mmseqsDbDir=dbDirectory,
                create_idx=idx_dbs,
                sensitivity=sensitivity,
                cutoff=coff,
                confCutoff=0.05,
                lenDiffThr=args.max_len_diff,
                overwrite_all=overwrite,
                overwrite_tbls=owOrthoTbls,
                update_run=update_run,
                keepAlign=args.keep_raw_alignments,
                debug=debug)
        #run multi-species clustering
        if not skipMulti:
            #copy sqltables
            multiOutDir = os.path.join(outDir, 'multi_species/')
            sqlPaths = orthodetect.fetch_sql_files(rootDir=outDir,
                                                   outDir=multiOutDir,
                                                   pairsFile=pairsFile,
                                                   coreOnly=False,
                                                   debug=debug)
            print('Ortholog tables loaded for multi-species orthology:\t%d' %
                  len(sqlPaths))
            sys.stdout.write('\nCreating ortholog groups...')
            multisp_start = time.perf_counter()
            quickparaRoot = orthodetect.get_quick_multiparanoid_src_dir()
            #create the multi-species clusters
            orthodetect.run_quickparanoid(sqlTblDir=multiOutDir,
                                          outDir=multiOutDir,
                                          srcDir=quickparaRoot,
                                          outName=multiSpeciesClstrNameAll,
                                          speciesFile=spFile,
                                          maxGenePerSp=maxGenePerSp,
                                          debug=debug)
            sys.stdout.write(
                'Ortholog groups creation elapsed time (seconds):\t{:s}\n'.
                format(str(round(time.perf_counter() - multisp_start, 3))))

        # output directory for stats
        relDict = '%sortholog_relations/' % outDir
        # calculate stats on the generated clusters
        #orthodetect.calc_ortholog_group_stats(rootDir=outDir, outDir=relDict, outName=None, pairsFile=pairsFile, debug=debug)
        # ALL
        orthoRelName = 'ortholog_pairs.tsv'
        if args.qfo_2011:
            orthoRelName = 'ortholog_pairs_benchmark.tsv'
        # generate the relations
        if output_relations:
            orthodetect.extract_ortholog_pairs(rootDir=outDir,
                                               outDir=relDict,
                                               outName=orthoRelName,
                                               pairsFile=pairsFile,
                                               coreOnly=False,
                                               splitMode=args.qfo_2011,
                                               debug=debug)
    else:  #Update the database
        update_run = True
        spFile, pairsFile = orthodetect.run_sonicparanoid2_multiproc(
            inDir,
            outDir=outDir,
            threads=cpus,
            sharedDir=alignDir,
            mmseqsDbDir=dbDirectory,
            create_idx=idx_dbs,
            sensitivity=sensitivity,
            cutoff=coff,
            confCutoff=0.05,
            lenDiffThr=args.max_len_diff,
            overwrite_all=overwrite,
            overwrite_tbls=owOrthoTbls,
            update_run=update_run,
            keepAlign=args.keep_raw_alignments,
            debug=debug)
        #run multi-species clustering
        if not skipMulti:
            #copy sqltables
            multiOutDir = os.path.join(outDir, '{:s}/'.format(updateId))
            sqlPaths = orthodetect.fetch_sql_files(rootDir=outDir,
                                                   outDir=multiOutDir,
                                                   pairsFile=pairsFile,
                                                   coreOnly=False,
                                                   debug=debug)
            print('Ortholog tables loaded for multi-species orthology:\t%d' %
                  len(sqlPaths))
            sys.stdout.write('\nCreating ortholog groups...')
            multisp_start = time.perf_counter()
            quickparaRoot = orthodetect.get_quick_multiparanoid_src_dir()
            #create the multi-species clusters
            orthodetect.run_quickparanoid(sqlTblDir=multiOutDir,
                                          outDir=multiOutDir,
                                          srcDir=quickparaRoot,
                                          outName=multiSpeciesClstrNameAll,
                                          speciesFile=spFile,
                                          maxGenePerSp=maxGenePerSp,
                                          debug=debug)
            sys.stdout.write(
                'Ortholog groups creation elapsed time (seconds):\t{:s}\n'.
                format(str(round(time.perf_counter() - multisp_start, 3))))
        # extract ortholog pairs
        relDict = '%sortholog_relations/' % outDir
        orthoRelName = '{:s}_relations.tsv'.format(updateId)
        if args.qfo_2011:
            orthoRelName = '{:s}_relations_benchmark.tsv'.format(updateId)
        if output_relations:
            orthodetect.extract_ortholog_pairs(rootDir=outDir,
                                               outDir=relDict,
                                               outName=orthoRelName,
                                               pairsFile=pairsFile,
                                               coreOnly=False,
                                               splitMode=args.qfo_2011,
                                               debug=debug)

    ex_end = round(time.perf_counter() - ex_start, 3)
    sys.stdout.write(
        '\nTotal elapsed time (seconds):\t{:0.3f}\n'.format(ex_end))

    # remove not required files
    cleanup(rootDir=outDir, debug=debug)
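
get_params() is not shown in these examples. A minimal sketch covering only a few of the attributes accessed in main() could look like the following; the flag spellings and defaults are assumptions, not the actual SonicParanoid interface.

import argparse

def get_params_sketch():
    """Return (args, parser), mirroring how get_params() is used above."""
    parser = argparse.ArgumentParser(description='SonicParanoid (argument sketch)')
    parser.add_argument('-i', '--input-directory', dest='input_directory', default=None)
    parser.add_argument('-o', '--output-directory', dest='output_directory', required=True)
    parser.add_argument('-m', '--mode', dest='mode', default='default',
                        choices=['fast', 'default', 'sensitive', 'most-sensitive'])
    parser.add_argument('-t', '--threads', dest='threads', type=int, default=4)
    parser.add_argument('-d', '--debug', dest='debug', action='store_true')
    args = parser.parse_args()
    return args, parser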
Example #6
def process_multisp_tbl(inTbl: str, outPath: str, debug: bool = False) -> None:
    """Check consistency of table with ortholog groups and extract main stats."""
    if debug:
        print('process_multisp_tbl :: START')
        print('Input ortholog groups table:\t{:s}'.format(inTbl))
        print('Output stats file:\t{:s}'.format(outPath))
    # check that the input table file is valid
    if not os.path.isfile(inTbl):
        sys.stderr.write(
            '\nERROR (file not found): you must provide a valid path to the text file containing the ortholog groups table generated using SonicParanoid.\n'
        )
        sys.exit(-2)

    # create the directory that will contain the output file if required
    systools.makedir(os.path.dirname(outPath))

    # start processing the ortholog groups
    fd = open(inTbl, 'r')
    # extract the header and check the validity of the input file
    hdr_columns: List[str] = fd.readline().rstrip('\n').split('\t')
    # check the hdr
    if not hdr_columns[0] == 'group_id':
        sys.stderr.write('\nERROR: the header is not valid.\n')
        sys.exit(
            'Make sure that the ortholog groups file was generated using SonicParanoid.'
        )
    spCntStr: str = str(len(hdr_columns[4:-1]) / 2)
    spCntStr = spCntStr.strip()
    # check that the number of species is valid, for example that no column was removed from the file
    # in this case the division must give a float ending with '.0'
    if not spCntStr.endswith('.0'):
        sys.stderr.write(
            '\nERROR : there is a problem with the number of species found in the table.\nMake sure you did not manually remove any column from the original output.\n'
        )
        sys.exit(-2)
    # convert the string to int
    spCnt: int = int(spCntStr.split('.', 1)[0])
    # variables to store the counts
    totCnt: int = 0
    allSpCnt: int = 0
    twoSpCnt: int = 0
    mostSeedsId: str = ''
    maxSeedsCnt: int = 0
    # start looping through the clusters
    for clstr in fd:
        flds: List[str] = clstr.rstrip('\n').split('\t')
        totCnt += 1
        clstrId: str = flds[0]
        # check if it contains all species
        if int(flds[2]) == spCnt:
            allSpCnt += 1
        elif int(flds[2]) == 2:
            twoSpCnt += 1
        # find the cluster with the highest number of orthologs with confidence 1.0
        seedsCnt = int(flds[3])
        if seedsCnt > maxSeedsCnt:
            maxSeedsCnt = seedsCnt
            mostSeedsId = clstrId
    fd.close()
    # variables with allSp pct
    allSpPct: float = round(float(allSpCnt / totCnt) * 100., 2)
    twoSpPct: float = round(float(twoSpCnt / totCnt) * 100., 2)

    # open the output file
    ofd = open(outPath, 'w')
    ofd.write('Stats for the ortholog groups file:\n{:s}\n'.format(inTbl))
    ofd.write('\nClusters:\t{:d}'.format(totCnt))
    ofd.write('\nSpecies:\t{:d}'.format(spCnt))
    ofd.write('\nClusters with all species:\t{:d}'.format(allSpCnt))
    ofd.write('\nPercentage of clusters with all species:\t{:10.2f}'.format(
        allSpPct))
    ofd.write('\nClusters with two species:\t{:d}'.format(twoSpCnt))
    ofd.write('\nPercentage of clusters with two species:\t{:10.2f}'.format(
        twoSpPct))
    ofd.write('\nCluster with highest number of main orthologs:\t{:s}'.format(
        mostSeedsId))
    ofd.close()
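
A hedged usage sketch of process_multisp_tbl; both paths are illustrative. The function writes the cluster counts, the species count, and the id of the cluster with the most seed orthologs to the stats file.

# hypothetical usage: summarize an ortholog groups table
process_multisp_tbl('multispecies_clusters.tsv',
                    'stats/multispecies_clusters_stats.txt', debug=True)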
Example #7
def extract_fasta(clstrDict: Dict[str, Dict[str, List[str]]],
                  fastaDir: str,
                  outDir: str,
                  multiFasta: bool = False,
                  annotationDict: Dict[str, List[List[str]]] = {},
                  debug: bool = False) -> None:
    """Extract FASTA sequences for echa cluster."""
    if debug:
        print('\nextract_fasta :: START')
        print('Clusters for which sequences will be extracted:\t{:d}'.format(
            len(clstrDict)))
        print('Directory with the species files: {:s}'.format(fastaDir))
        print('Output directory: {:s}'.format(outDir))
        print('Output multiple FASTA files: {:s}'.format(str(multiFasta)))
        print('Length of annotation dictionary: {:d}'.format(
            len(annotationDict)))

    annotate: bool = False
    if len(annotationDict) > 0:
        annotate = True

    # check that the directory with the FASTA files exists
    if not os.path.isdir(fastaDir):
        sys.stderr.write(
            '\nERROR (file not found): you must provide a valid path to the directory containing the species files.\n'
        )
        sys.exit(-2)
    else:  # make sure it is not empty
        tmpList: List[str] = os.listdir(fastaDir)
        if len(tmpList) < 2:
            sys.stderr.write(
                '\nERROR: the directory containing the species files must contain at least two FASTA files.\n'
            )
            sys.exit(-5)

    # will contain the species names that are actually required
    requSpDict: Dict[str, str] = {}

    # create the list with required species files
    for clstr, sp2geneDict in clstrDict.items():
        for sp, orthoList in sp2geneDict.items():
            # only process if required
            if sp in requSpDict:
                continue
            else:
                # make sure there is at least one ortholog for the current species
                if len(orthoList) == 1:  # it could be empty
                    if orthoList[0][0] == '*':  # then it is empty
                        continue
            # add the species to the dictionary
            tmpPath: str = os.path.join(fastaDir, sp)
            requSpDict[sp] = tmpPath
            if not os.path.isfile(tmpPath):
                sys.stderr.write(
                    '\nERROR (file not found): the species file for {:s} was not found at\n{:s}\nplease provide a valid path.\n'
                    .format(sp, tmpPath))
                sys.exit(-2)

    # load all the sequences in a dictionary
    # example, tvaginalis -> geneXAB -> ATGTAGGTA
    seqsDict: Dict[str, Dict[str, str]] = {}
    for spFile, fastaPath in requSpDict.items():
        spName: str = os.path.basename(spFile)
        seqsDict[spName] = load_seqs_in_dict(fastaPath=fastaPath, debug=debug)

    # deque to contain the paths of each single output file
    tmpDq: Deque[str]
    # now generate the output files,
    # organized in directories, one per cluster id,
    # with each file named clusterId-species_name
    for clstr, sp2geneDict in clstrDict.items():
        # create the output directory
        tmpClstrDir: str = os.path.join(outDir, 'clstr{:s}/'.format(clstr))
        systools.makedir(tmpClstrDir)
        # now for each species extract the sequences
        if multiFasta:  #write one fasta file for each species
            for sp, orthoList in sp2geneDict.items():
                # skip the creation of files if the cluster is empty
                if len(orthoList) == 1:
                    if orthoList[0][0] == '*':
                        continue
                tmpFastaName = 'clstr{:s}-{:s}.fasta'.format(clstr, sp)
                tmpOutPath = os.path.join(tmpClstrDir, tmpFastaName)
                ofd = open(tmpOutPath, 'w')
                # write the sequences
                for ortho in orthoList:
                    if annotate:
                        # create the header by merging the annotations
                        newHdr: str
                        if ortho in annotationDict:  # sometimes no annotation is found!
                            annotLists = annotationDict[ortho]
                            newHdr = '|'.join(
                                [';'.join(l) for l in annotLists])
                            ofd.write('>{:s}\n'.format(newHdr))
                        else:
                            ofd.write('>{:s}\n'.format(ortho))
                    else:
                        ofd.write('>{:s}\n'.format(ortho))
                    # write the sequence
                    ofd.write('{:s}\n'.format(str(seqsDict[sp][ortho])))
                ofd.close()
        else:  #write a single FASTA file
            tmpFastaName = 'clstr{:s}.fasta'.format(clstr)
            tmpOutPath = os.path.join(tmpClstrDir, tmpFastaName)
            ofd = open(tmpOutPath, 'w')
            for sp, orthoList in sp2geneDict.items():
                # skip the creation of files if the cluster is empty
                if len(orthoList) == 1:
                    if orthoList[0][0] == '*':
                        continue
                # write the sequences
                for ortho in orthoList:
                    if annotate:
                        # create the header by merging the annotations
                        newHdr: str
                        if ortho in annotationDict:  # sometimes no annotation is found!
                            annotLists = annotationDict[ortho]
                            newHdr = '|'.join(
                                [';'.join(l) for l in annotLists])
                            ofd.write('>{:s}\n'.format(newHdr))
                        else:
                            ofd.write('>{:s}\n'.format(ortho))
                    else:
                        ofd.write('>{:s}\n'.format(ortho))
                    # write the sequence
                    ofd.write('{:s}\n'.format(str(seqsDict[sp][ortho])))
            ofd.close()
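
extract_fasta depends on load_seqs_in_dict(), which is not included in these examples. A minimal sketch of a plain FASTA loader with the same call signature could look like this; it is an assumption, not the actual implementation.

from typing import Dict

def load_seqs_in_dict_sketch(fastaPath: str, debug: bool = False) -> Dict[str, str]:
    """Map each sequence id (first token of the FASTA header) to its sequence."""
    seqs: Dict[str, str] = {}
    seqId = ''
    for ln in open(fastaPath):
        ln = ln.rstrip('\n')
        if ln.startswith('>'):
            toks = ln[1:].split()
            seqId = toks[0] if toks else ''
            seqs[seqId] = ''
        elif seqId:
            seqs[seqId] += ln
    return seqs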
Example #8
def extract_by_sp_cnt(inTbl: str,
                      min: int = 2,
                      max: int = 2,
                      outDir: str = os.getcwd(),
                      minConf: float = 0.1,
                      debug: bool = False) -> Dict[str, Dict[str, List[str]]]:
    """Extract clusters based on on the number of species of which they are composed."""
    if debug:
        print('\nextract_by_sp_cnt :: START')
        print('Input groups table:\t{:s}'.format(inTbl))
        print('Minimum number of species in cluster:\t{:d}'.format(min))
        print('Maximum number of species in cluster:\t{:d}'.format(max))
        print('Output directory: {:s}'.format(outDir))
        print('Minimum confidence for orthologs:\t{:.2f}'.format(minConf))
    # check that the input table file is valid
    if not os.path.isfile(inTbl):
        sys.stderr.write(
            '\nERROR (file not found): you must provide a valid path to the text file containing the ortholog groups table generated using SonicParanoid.\n'
        )
        sys.exit(-2)

    # check the minimum confidence value
    if not (0.05 <= minConf <= 1.):
        sys.stderr.write(
            '\nWARNING: the ortholog confidence threshold must be set to a value between 0.05 and 1.0.\n'
        )
        sys.stderr.write('It will now be set to 0.1.\n')
        minConf = 0.1
    # start processing the ortholog groups
    fd = open(inTbl, 'r')
    # extract the header and check the validity of the input file
    hdr_columns: List[str] = fd.readline().rstrip('\n').split('\t')
    # check the hdr
    if not hdr_columns[0] == 'group_id':
        sys.stderr.write('\nERROR: {:s}\nis not a valid header.\n'.format(
            '\t'.join(hdr_columns)))
        sys.exit(
            'Make sure that the ortholog groups file was generated using SonicParanoid.'
        )
    spCntStr: str = str(len(hdr_columns[4:-1]) / 2)
    spCntStr = spCntStr.strip()
    # check that the number of species is valid, for example that no column was removed from the file
    # in this case the division must give a float ending with '.0'
    if not spCntStr.endswith('.0'):
        sys.stderr.write(
            '\nERROR : there is a problem with the number of species found in the table.\nMake sure you did not manually remove any column from the original output.\n'
        )
        sys.exit(-2)
    # convert the string to int
    spCnt: int = int(spCntStr.split('.', 1)[0])
    # More species requested than those available in the input clusters
    if min > spCnt:
        sys.stderr.write(
            '\nWARNING: {:d} species were found in the input table header, hence clusters with {:d} species cannot exist.\n'
            .format(spCnt, max))
        sys.stderr.write(
            'Both minimum and maximum will be set to ({:d}).\n'.format(spCnt))
        min = spCnt
        max = spCnt
    # min should be lower than max!
    if min > max:
        sys.stderr.write(
            '\nWARNING: the minimum number of species ({:d}) is higher than the maximum number of species ({:d}).\n'
            .format(min, max))
        sys.stderr.write(
            'Max will be set to the maximum number of species in the table ({:d}).\n'
            .format(spCnt))
        max = spCnt

    # extract the species list
    spList: List[str] = []  # will contain the species names
    for i, el in enumerate(hdr_columns[4:-1]):
        if i % 2 == 0:
            spList.append(el)

    # prepare the output file
    if outDir[-1] != '/':
        outDir += '/'
    outPath: str = os.path.join(
        outDir,
        'filtered_min{:d}_max{:d}_{:s}'.format(min, max,
                                               os.path.basename(inTbl)))
    # create the output directory if required
    systools.makedir(outDir)
    ofd = open(outPath, 'w')
    # write the header
    ofd.write('{:s}\n'.format('\t'.join(hdr_columns)))

    # output dictionary
    # example: clst105 -> tvaginalis -> [g1, g4, g5]
    outDict: Dict[str, Dict[str, List[str]]] = {}
    extractedClstrCnt: int = 0
    extractedGenesCnt: int = 0
    totCnt: int = 0
    tmpSp: str = ''

    # start looping through the clusters
    for clstr in fd:
        flds: List[str] = clstr.rstrip('\n').split('\t')
        totCnt += 1
        clstrId: str = flds[0]
        spSize: int = int(flds[2])
        # check if it contains all species
        if min <= spSize <= max:
            # write the filtered output file
            ofd.write(clstr)
            # keep only the usable fields
            flds = flds[4:-1]
            # add the id to output dictionary
            outDict[clstrId] = {}
            for i, el in enumerate(flds):
                # extract the cluster
                if i % 2 == 0:
                    # example of cluster
                    # 2336_Q9X2I8,2336_Q9X172:0.159
                    # create the list for the species
                    tmpSp = spList[int(i / 2)]
                    outDict[clstrId][tmpSp] = []
                    for ortho in el.split(','):
                        tmpFlds: List[str] = ortho.split(':')
                        tmpConf: float
                        # case in which multiple ':' separators are in the gene name
                        if len(tmpFlds) > 3:
                            if ortho[-1] == ':':
                                # for example, x1ab:
                                # it is an ortholog for sure
                                outDict[clstrId][tmpSp].append(ortho)
                                extractedGenesCnt += 1
                                continue
                            else:  # the final field is the confidence
                                if float(tmpFlds[-1]) >= minConf:
                                    # extract and append the gene name
                                    outDict[clstrId][tmpSp].append(':'.join(
                                        tmpFlds[:-1]))
                                    extractedGenesCnt += 1
                                    continue
                        else:  # simpler case
                            if len(tmpFlds) == 2:
                                if float(tmpFlds[-1]) >= minConf:
                                    outDict[clstrId][tmpSp].append(tmpFlds[0])
                                    extractedGenesCnt += 1
                            else:  # then the confidence must be 1.0
                                outDict[clstrId][tmpSp].append(tmpFlds[0])
                                if tmpFlds[0][0] != '*':
                                    extractedGenesCnt += 1
            # increase the count of extracted clusters
            extractedClstrCnt += 1
    fd.close()
    # close output file
    ofd.close()

    # print some debug line
    if debug:
        print('Extracted clusters:\t{:d}'.format(len(outDict)))
        print('Extracted genes:\t{:d}'.format(extractedGenesCnt))
        print('Percentage of extracted clusters:\t{:.2f}'.format(
            round(float(extractedClstrCnt / totCnt) * 100., 2)))
    # return the main dictionary
    return outDict
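
A hedged usage sketch of extract_by_sp_cnt; the table path and output directory are illustrative only.

# hypothetical usage: keep only clusters containing between 3 and 5 species
clusters = extract_by_sp_cnt('multispecies_clusters.tsv', min=3, max=5,
                             outDir='filtered_groups/', minConf=0.2, debug=True)
print('Extracted clusters:', len(clusters))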
Example #9
def extract_by_id(inTbl: str,
                  idList: List[str] = [],
                  outDir: str = os.getcwd(),
                  minConf: float = 0.1,
                  debug: bool = False) -> Dict[str, Dict[str, List[str]]]:
    """Extract clusters based on on the number of species of which they are composed."""
    if debug:
        print('\nextract_by_id :: START')
        print('Input groups table:\t{:s}'.format(inTbl))
        print('Number of clusters to be extracted:\t{:d}'.format(len(idList)))
        print('IDs to be extracted:\t{:s}'.format(str(idList)))
        print('Output directory: {:s}'.format(outDir))
        print('Minimum confidence for orthologs:\t{:.2f}'.format(minConf))

    # check that the input table file is valid
    if not os.path.isfile(inTbl):
        sys.stderr.write(
            '\nERROR (file not found): you must provide a valid path to the text file containing the ortholog groups table generated using SonicParanoid.\n'
        )
        sys.exit(-2)
    # Check that at least one ID is in the list
    if len(idList) == 0:
        sys.stderr.write(
            '\nERROR: you must provide at least one cluster ID to be extracted, while you have provided none.\n'
        )
        sys.exit(-5)
    # check that there are no repeated IDs in the ID list
    tmpDict: Dict[str, None] = {}
    tmpList: List[str] = []
    for el in idList:
        if el not in tmpDict:
            tmpDict[el] = None
        else:
            tmpList.append(el)
    # remove the repeated IDs if required
    if len(tmpList) > 0:
        for el in tmpList:
            idList.remove(el)
        sys.stderr.write(
            '\nWARNING: the following cluster IDs were repeated in the input ID list and were removed.'
        )
        sys.stderr.write('\n{:s}'.format(str(tmpList)))
        sys.stderr.write(
            '\nThe ID list now contains {:d} cluster IDs.\n\n'.format(
                len(idList)))
    # remove the tmp structure
    del tmpDict
    tmpList.clear()

    # start processing the ortholog groups
    fd = open(inTbl, 'r')
    # extract the header and check the validity of the input file
    hdr_columns: List[str] = fd.readline().rstrip('\n').split('\t')
    # check the hdr
    if not hdr_columns[0] == 'group_id':
        sys.stderr.write('\nERROR: {:s}\nis not a valid header.\n'.format(
            '\t'.join(hdr_columns)))
        sys.exit(
            'Make sure that the ortholog groups file was generated using SonicParanoid.'
        )
    # extract the species count
    spCntStr: str = str(len(hdr_columns[4:-1]) / 2)
    spCntStr = spCntStr.strip()
    # check that the number of species is valid, for example that no column was removed from the file
    # in this case the division must give a float ending with '.0'
    if not spCntStr.endswith('.0'):
        sys.stderr.write(
            '\nERROR : there is a problem with the number of species found in the table.\nMake sure you did not manually remove any column from the original output.\n'
        )
        sys.exit(-2)
    # convert the string to int
    spCnt: int = int(spCntStr.split('.', 1)[0])
    # extract the species list
    spList: List[str] = []  # will contain the species names
    for i, el in enumerate(hdr_columns[4:-1]):
        if i % 2 == 0:
            spList.append(el)

    # prepare the output file
    outPath: str = os.path.join(
        outDir, 'filtered_{:s}'.format(os.path.basename(inTbl)))
    # create the output directory if required
    systools.makedir(outDir)
    ofd = open(outPath, 'w')
    # write the header
    ofd.write('{:s}\n'.format('\t'.join(hdr_columns)))

    # output dictionary and other variables
    # example: clst105 -> tvaginalis -> [g1, g4, g5]
    outDict: Dict[str, Dict[str, List[str]]] = {}
    extractedClstrCnt: int = 0
    extractedGenesCnt: int = 0
    totCnt: int = 0
    tmpSp: str = ''

    # start looping through the clusters
    for clstr in fd:
        flds: List[str] = clstr.rstrip('\n').split('\t')
        totCnt += 1
        clstrId: str = flds[0]
        # extract the information from the cluster
        if clstrId in idList:
            # write the filtered output file
            ofd.write(clstr)
            # keep only the usable fields
            flds = flds[4:-1]
            # add the id to output dictionary
            outDict[clstrId] = {}
            for i, el in enumerate(flds):
                # extract the cluster
                if i % 2 == 0:
                    # example of cluster
                    # 2336_Q9X2I8,2336_Q9X172:0.159
                    # create the list for the species
                    tmpSp = spList[int(i / 2)]
                    outDict[clstrId][tmpSp] = []
                    for ortho in el.split(','):
                        tmpFlds: List[str] = ortho.split(':')
                        tmpConf: float
                        # case in which multiple ':' separators are in the gene name
                        if len(tmpFlds) > 3:
                            if ortho[-1] == ':':
                                # for example, x1ab:
                                # it is an ortholog for sure
                                outDict[clstrId][tmpSp].append(ortho)
                                if outDict[clstrId][tmpSp][-1][0] != '*':
                                    extractedGenesCnt += 1
                                continue
                            else:  # the final field is the confidence
                                if float(tmpFlds[-1]) >= minConf:
                                    # extract and append the gene name
                                    outDict[clstrId][tmpSp].append(':'.join(
                                        tmpFlds[:-1]))
                                    if outDict[clstrId][tmpSp][-1][0] != '*':
                                        extractedGenesCnt += 1
                                    continue
                        else:  # simpler case
                            if len(tmpFlds) == 2:
                                if float(tmpFlds[-1]) >= minConf:
                                    outDict[clstrId][tmpSp].append(tmpFlds[0])
                                    extractedGenesCnt += 1
                            else:  # then the confidence must be 1.0
                                outDict[clstrId][tmpSp].append(tmpFlds[0])
                                if tmpFlds[0][0] != '*':
                                    extractedGenesCnt += 1
            # remove the ID from the list
            idList.remove(clstrId)
            # increase the count of extracted clusters
            extractedClstrCnt += 1
    fd.close()
    # close output file
    ofd.close()

    # print some debug line
    if debug:
        print('Extracted clusters:\t{:d}'.format(len(outDict)))
        if len(idList) > 0:
            print(
                '(WARNING) The following clusters were not found: {:s}'.format(
                    str(idList)))
        print('Extracted genes:\t{:d}'.format(extractedGenesCnt))
        print('Percentage of extracted clusters:\t{:.2f}'.format(
            round(float(extractedClstrCnt / totCnt) * 100., 2)))
    # return the main dictionary
    return outDict
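
A hedged usage sketch of extract_by_id; the table path, output directory, and cluster IDs are illustrative. Because the function removes each ID it finds from idList in place, a copy of the requested IDs is passed here.

# hypothetical usage: extract two clusters by their IDs
wanted = ['12', '105']
clusters = extract_by_id('multispecies_clusters.tsv', idList=list(wanted),
                         outDir='selected_groups/', minConf=0.1, debug=True)
for clstrId, sp2genes in clusters.items():
    print(clstrId, sum(len(genes) for genes in sp2genes.values()), 'genes')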
Example #10
def perform_parallel_orthology_inference(requiredPairsDict,
                                         inDir,
                                         outDir=os.getcwd(),
                                         sharedDir=None,
                                         cutoff=40,
                                         confCutoff=0.05,
                                         lenDiffThr=0.5,
                                         threads=8,
                                         debug=False):
    """Execute orthology inference for the required pairs."""
    if debug:
        print('\nperform_parallel_orthology_inference :: START')
        print('Proteome pairs to be processed:\t{:d}'.format(
            len(requiredPairsDict)))
        print('Input directory:{:s}'.format(inDir))
        print('Outdir:{:s}'.format(outDir))
        print('Alignment directory:{:s}'.format(sharedDir))
        print('Cutoff:\t{:d}'.format(cutoff))
        print('Confidence cutoff for paralogs:\t{:s}'.format(str(confCutoff)))
        print('Length difference filtering threshold:\t{:s}'.format(
            str(lenDiffThr)))
        print('CPUs (for mmseqs):\t{:d}'.format(threads))
    # make sure that the directory with alignments exists
    if not os.path.isdir(sharedDir):
        sys.stderr.write(
            'ERROR: The directory with the alignment files\n{:s}\nwas not found, please provide a valid path\n'
            .format(sharedDir))
    # make sure that the directory with the input files exists
    if not os.path.isdir(inDir):
        sys.stderr.write(
            'ERROR: The directory with the input files\n{:s}\nwas not found, please provide a valid path\n'
            .format(inDir))
    # create the output directory if it does not exist yet
    if outDir != os.getcwd():
        if not os.path.isdir(outDir):
            systools.makedir(outDir)
    if outDir[-1] != '/':
        outDir += '/'
    # check if the output directory differs from the input one
    if os.path.dirname(inDir) == os.path.dirname(outDir):
        sys.stderr.write(
            '\nERROR: the output directory {:s}\nmust be different from the one in which the input files are stored.\n'
            .format(outDir))
        sys.exit(-2)
    # check cutoff
    if cutoff < 30:
        cutoff = 40
    # create the queue and start adding the jobs
    jobs_queue = mp.Queue()

    # fill the queue with the processes
    for pair in requiredPairsDict:
        jobs_queue.put(pair)
    # add one sentinel flag per worker to mark the end of the jobs
    for i in range(0, threads):
        jobs_queue.put(None)
    #sys.exit('DEBUG :: 3')

    # Queue to contain the execution time
    results_queue = mp.Queue(maxsize=len(requiredPairsDict))
    # call the method inside workers
    runningJobs = [
        mp.Process(target=consume_orthology_inference_jobs,
                   args=(jobs_queue, results_queue, inDir, outDir, sharedDir,
                         cutoff, confCutoff, lenDiffThr, threads, debug))
        for i_ in range(threads)
    ]

    for proc in runningJobs:
        #print('Start job\t{}'.format(proc))
        proc.start()

    # open the file in which the time information will be stored
    execTimeOutPath = os.path.join(
        sharedDir, 'orthology_ex_time_{:s}.tsv'.format(
            os.path.basename(outDir.rstrip('/'))))
    ofd = open(execTimeOutPath, 'w', buffering=1)

    # get the results from the queue without filling the Pipe buffer
    while True:
        try:
            p, val = results_queue.get(False, 0.01)
            ofd.write('{:s}\t{:s}\n'.format(p, str(val)))
        except queue.Empty:
            pass
        allExited = True
        for t in runningJobs:
            if t.exitcode is None:
                allExited = False
                break
        if allExited and results_queue.empty():
            break
    ofd.close()

    for proc in runningJobs:
        while proc.is_alive():
            proc.join()
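
A hedged usage sketch: only the keys of requiredPairsDict are consumed above, as 'speciesA-speciesB' pair names, so a plain dictionary with None values is enough. All paths are illustrative, and the required alignments are assumed to already exist in sharedDir.

# hypothetical usage of the parallel orthology inference driver
pairs = {'speciesA-speciesB': None, 'speciesA-speciesC': None}
perform_parallel_orthology_inference(pairs,
                                     '/data/proteomes/',
                                     outDir='/data/orthologs/',
                                     sharedDir='/data/alignments/',
                                     cutoff=40,
                                     confCutoff=0.05,
                                     lenDiffThr=0.5,
                                     threads=4,
                                     debug=False)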
Example #11
def mmseqs_search(inSeq,
                  dbSeq,
                  dbDir=os.getcwd(),
                  outDir=os.getcwd(),
                  tmpDirName=None,
                  sensitivity=4.0,
                  evalue=1000,
                  threads=4,
                  cleanUp=False,
                  debug=False):
    """Align protein sequences using mmseqs2."""
    if debug:
        print('\nmmseqs_search :: START')
        print('Input query FASTA file:\t%s' % inSeq)
        print('Input target FASTA file:\t%s' % dbSeq)
        print('mmseqs2 database directory:\t%s' % dbDir)
        print('Output directory:\t%s' % outDir)
        print('MMseqs2 tmp directory:\t{:s}'.format(tmpDirName))
        print('MMseqs2 sensitivity (-s):\t%s' % str(sensitivity))
        print('Threads:\t%d' % threads)
        print('Remove temporary files:\t%s' % cleanUp)
    # check that the query and target files exist
    if not os.path.isfile(inSeq):
        sys.stderr.write(
            'The query file %s was not found, please provide the path to a valid FASTA file'
            % inSeq)
        sys.exit(-2)
    if not os.path.isfile(dbSeq):
        sys.stderr.write(
            'The target file %s was not found, please provide the path to a valid FASTA file'
            % dbSeq)
        sys.exit(-2)
    # check sensitivity
    if (sensitivity < 1) or sensitivity > 8.5:
        sys.stderr.write(
            '\nERROR: the sensitivity value for MMseqs2.0 must be a value between 1.0 and 8.5.\n'
        )
        sys.exit(-5)
    # create directory if not previously created
    systools.makedir(outDir)
    systools.makedir(dbDir)
    # set the tmp dir
    tmpDir = None
    if tmpDirName is None:
        tmpDir = '%stmp_mmseqs/' % outDir
    else:
        tmpDir = '{:s}{:s}/'.format(outDir, tmpDirName)
    systools.makedir(tmpDir)
    # check the query db name
    queryDBname = os.path.basename(inSeq)
    queryDBname = queryDBname.split('.')[
        0]  # take the left part of the file name
    queryDBname = '%s.mmseqs2db' % queryDBname
    queryDBpath = '%s%s' % (dbDir, queryDBname)
    # create the database if it does not exist yet
    if not os.path.isfile(queryDBpath):
        mmseqs_createdb(inSeq, outDir=dbDir, debug=debug)
        mmseqs_createindex(queryDBpath, debug=debug)
    # check the target db name
    targetDBname = os.path.basename(dbSeq)
    targetDBname = targetDBname.split('.')[
        0]  # take the left part of the file name
    targetDBname = '%s.mmseqs2db' % targetDBname
    targetDBpath = '%s%s' % (dbDir, targetDBname)
    # create the database if it does not exist yet
    if not os.path.isfile(targetDBpath):
        mmseqs_createdb(dbSeq, outDir=dbDir, debug=debug)
        mmseqs_createindex(targetDBpath, debug=debug)
    # set output name
    pairName = '%s-%s' % (os.path.basename(inSeq), os.path.basename(dbSeq))
    rawOutName = 'mmseqs2raw.%s' % pairName
    rawOutPath = '%s%s' % (outDir, rawOutName)
    blastOutName = 'mmseqs2blast.%s' % pairName
    blastOutPath = '%s%s' % (outDir, blastOutName)
    # start measuring the execution time
    # use perf_counter (includes time spent during sleep)
    start_time = time.perf_counter()
    # command to be executed
    minUngappedScore = 15
    # EXAMPLE: mmseqs search queryDBfile targetDBfile outputFile tmpDir -s 7.5 -e 100000 --threads threads
    searchCmd = '{:s} search {:s} {:s} {:s} {:s} -s {:s} --threads {:d} -v 0 --min-ungapped-score {:d} --alignment-mode 2 --alt-ali 10'.format(
        get_mmseqs_path(), queryDBpath, targetDBpath, rawOutPath, tmpDir,
        str(sensitivity), threads, minUngappedScore)
    if debug:
        print('mmseqs2 search CMD:\t%s' % searchCmd)
    # use run (or call)
    subprocess.run(searchCmd, env=my_env, shell=True)
    # output an error if the Alignment did not finish correctly
    if not os.path.isfile(rawOutPath):
        sys.stderr.write(
            '\nERROR: the MMseqs2 raw alignment file was not generated.\n')
        sys.exit(-2)
    # stop counter
    # use perf_counter (includes time spent during sleep)
    end_search = time.perf_counter()
    # use process_time (user + system CPU time, no sleep time)
    #end_search = time.process_time()
    search_time = round(end_search - start_time, 2)
    # convert the output to tab-separated BLAST output
    # EXAMPLE: mmseqs convertalis query.db target.db query_target_rawout query_target_blastout
    convertCmd = '%s convertalis %s %s %s %s -v 0 --format-mode 0' % (
        get_mmseqs_path(), queryDBpath, targetDBpath, rawOutPath, blastOutPath)
    # perform the file conversion
    subprocess.run(convertCmd, env=my_env, shell=True)
    if debug:
        print('mmseqs2 convertalis CMD:\t%s' % convertCmd)
    # exec time conversion
    #convert_time = round(time.time() - end_search, 2)
    # use perf_counter (includes time spent during sleep)
    convert_time = round(time.perf_counter() - end_search, 2)
    # use process_time (user + system CPU time, no sleep time)
    #convert_time = round(time.process_time() - end_search, 2)
    # cleanup output directory
    if cleanUp:
        mmseqs_cleanup(inDir=outDir, debug=debug)
    # output an error if the Alignment could not be converted
    if not os.path.isfile(blastOutPath):
        sys.stderr.write(
            '\nERROR: the MMseqs2 raw alignment could not be converted into the BLAST alignment format.\n'
        )
        sys.exit(-2)
    return (blastOutPath, search_time, convert_time)
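
A hedged usage sketch of mmseqs_search; all paths are illustrative, and the databases are created on the fly if they are missing from dbDir.

# hypothetical usage: align a query proteome against a target proteome
blastOutPath, search_time, convert_time = mmseqs_search(
    'speciesA.fasta', 'speciesB.fasta',
    dbDir='/tmp/mmseqs2_databases/', outDir='/tmp/alignments/',
    tmpDirName='tmp_A-B', sensitivity=4.0, threads=4, cleanUp=True, debug=True)
print('BLAST-formatted alignments:', blastOutPath)
print('Search/convert time (s):', search_time, convert_time)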
Example #12
def consume_orthology_inference_jobs(jobs_queue,
                                     results_queue,
                                     inDir,
                                     outDir=os.getcwd(),
                                     sharedDir=None,
                                     cutoff=40,
                                     confCutoff=0.05,
                                     lenDiffThr=0.5,
                                     threads=8,
                                     debug=False):
    """Perform orthology inference in parallel."""
    while True:
        current_pair = jobs_queue.get(True, 1)
        if current_pair is None:
            break
        # create the output directory if needed
        # prepare the run
        sp1, sp2 = current_pair.split('-', 1)
        runDir = os.path.join(outDir, current_pair)
        systools.makedir(runDir)
        inSp1 = os.path.join(inDir, sp1)
        inSp2 = os.path.join(inDir, sp2)
        # check that the input files do exist
        if not os.path.isfile(inSp1):
            sys.stderr.write(
                'ERROR: The input file for {:s} was not found, please provide a valid path.\n'
                .format(sp1))
        if not os.path.isfile(inSp2):
            sys.stderr.write(
                'ERROR: The input file for {:s} was not found, please provide a valid path.\n'
                .format(sp2))
        # prepare the names of the required alignments
        # check the AA alignment
        AA = '{:s}-{:s}'.format(sp1, sp1)
        shPathAA = os.path.join(sharedDir, AA)
        if not os.path.isfile(shPathAA):
            sys.stderr.write(
                'ERROR: The alignment file for {:s} was not found, please generate alignments first.\n'
                .format(AA))
        # check the BB alignment
        BB = '{:s}-{:s}'.format(sp2, sp2)
        shPathBB = os.path.join(sharedDir, BB)
        if not os.path.isfile(shPathBB):
            sys.stderr.write(
                'ERROR: The alignment file for {:s} was not found, please generate alignments first.\n'
                .format(BB))
        # check the AB alignment
        AB = '{:s}-{:s}'.format(sp1, sp2)
        shPathAB = os.path.join(sharedDir, AB)
        if not os.path.isfile(shPathAB):
            sys.stderr.write(
                'ERROR: The alignment file for {:s} was not found, please generate alignments first.\n'
                .format(AB))
        # check the BA alignment
        BA = '{:s}-{:s}'.format(sp2, sp1)
        shPathBA = os.path.join(sharedDir, BA)
        if not os.path.isfile(shPathBA):
            sys.stderr.write(
                'ERROR: The alignment file for {:s} was not found, please generate alignments first.\n'
                .format(BA))
        #sys.exit('DEBUG :: workers :: consume_orthology_inference_jobs :: after files copy')

        # prepare paths for output tables
        outTable = os.path.join(runDir, 'table.{:s}'.format(current_pair))
        outSql = os.path.join(runDir, 'sqltable.{:s}'.format(current_pair))

        # infer orthologs
        # use perf_counter (includes time spent during sleep)
        orthology_prediction_start = time.perf_counter()
        inpyranoid.infer_orthologs(inSp1,
                                   inSp2,
                                   alignDir=sharedDir,
                                   outDir=runDir,
                                   confCutoff=confCutoff,
                                   lenDiffThr=lenDiffThr,
                                   debug=False)
        #sys.exit('DEBUG :: workers :: consume_orthology_inference_jobs :: after orthology')
        #check that all the files have been created
        if not os.path.isfile(outTable):
            sys.stderr.write(
                'WARNING: the ortholog table file %s was not generated.' %
                outTable)
            outTable = None
        if not os.path.isfile(outSql):
            sys.stderr.write('WARNING: the SQL table %s was not generated.' %
                             outSql)
            outSql = None
        #everything went ok!
        # use perf_counter (includes time spent during sleep)
        end_time = time.perf_counter()
        orthology_prediction_tot = round(end_time - orthology_prediction_start,
                                         2)
        #sys.exit('DEBUG :: workers :: consume_orthology_inference_jobs :: orthology done!')
        # add the execution time to the results queue
        results_queue.put((current_pair, str(orthology_prediction_tot)))
        if debug:
            sys.stdout.write(
                '\nOrthology prediction {:s} (seconds):\t{:s}\n'.format(
                    current_pair, str(orthology_prediction_tot)))
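A brief usage note (not part of the original example): consume_orthology_inference_jobs is a standard queue consumer that runs until it reads a None sentinel from jobs_queue and reports one (pair, seconds) tuple per processed species pair on results_queue. A minimal sketch of a driver, using hypothetical directory paths and species-pair names, could look like this:

import multiprocessing as mp

def run_inference_pool(pairs, inDir, outDir, sharedDir, nWorkers=4):
    # sketch only: one job per species pair, one None sentinel per worker
    jobs_queue = mp.Queue()
    results_queue = mp.Queue()
    for pair in pairs:            # e.g. '1423-9606'
        jobs_queue.put(pair)
    for _ in range(nWorkers):
        jobs_queue.put(None)      # tells each worker to stop
    workers = [
        mp.Process(target=consume_orthology_inference_jobs,
                   args=(jobs_queue, results_queue, inDir),
                   kwargs={'outDir': outDir, 'sharedDir': sharedDir})
        for _ in range(nWorkers)
    ]
    for w in workers:
        w.start()
    # drain one result per submitted pair before joining the workers
    timings = dict(results_queue.get() for _ in pairs)
    for w in workers:
        w.join()
    return timings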
Example #13
def filter_ortholog_table(abTbl,
                          a,
                          b,
                          outDir=os.getcwd(),
                          lenThr=0.25,
                          debug=False):
    '''Filter ortholog table based on sequence lengths.'''
    if debug:
        print('filter_ortholog_table :: START')
        print('Ortholog table: %s' % abTbl)
        print('Proteome A: %s' % a)
        print('Proteome B: %s' % b)
        print('Output directory: %s' % outDir)
        print('Length difference threshold: %s' % str(lenThr))
    # load sequence lengths for A and B
    lenDictA = load_seq_lengths(a, debug)
    lenDictB = load_seq_lengths(b, debug)

    # load the cluster information from the table
    # EXAMPLE
    #OrtoId	Score	OrtoA	OrtoB
    #1	1163	1423_Q9KWU4 1.0	9606_P11498 1.0
    #2	963	1423_P09339 1.0	9606_P21399 1.0 9606_P48200 0.201

    # make sure the output directory path ends with a slash
    if outDir[-1] != '/':
        outDir = '%s/' % outDir

    # check that the output directory differs from that of the input table
    inDir = '%s/' % os.path.dirname(abTbl)
    if inDir == outDir:
        sys.exit(
            'The output directory must be different from that of the input table.'
        )
    # create output directory if needed
    systools.makedir(outDir)

    # new table path
    abTblNew = '%s%s' % (outDir, os.path.basename(abTbl))
    # rejected list file path
    rjctTbl = '%s%s' % (outDir,
                        os.path.basename(abTbl.replace('table.', 'rejected.')))
    # open output files
    ofdNewTbl = open(abTblNew, 'w')
    ofdRjct = open(rjctTbl, 'w')

    # count read and written genes
    orthoRdCntA = inparaRdCntA = orthoRdCntB = inparaRdCntB = 0
    orthoWrtCntA = inparaWrtCntA = orthoWrtCntB = inparaWrtCntB = 0

    for ln in open(abTbl):
        if ln[0] == 'O':
            ofdNewTbl.write(ln)
            continue
        ln = ln.rstrip('\n')
        clstrId, score, paraA, paraB = ln.split('\t')

        # extract orthologs and inparalogs from A
        orthoListA, inparaDictRawA = extract_paralogs(paraA, debug=debug)
        orthoRdCntA += len(orthoListA)
        inparaRdCntA += len(inparaDictRawA)

        # If there are InParalogs then check if they should be kept or not
        keptInpaListA = []
        droppedInpaDictA = OrderedDict()
        if len(inparaDictRawA):
            #print('ClstrID:\t%s'%clstrId)
            # set ortholog length for A
            lenListOrthoA = []
            if len(orthoListA) > 1:
                lenListOrthoA = calc_ortholog_leghths(orthoListA,
                                                      lenDictA,
                                                      debug=debug)
            elif len(orthoListA) == 0:
                sys.exit('ERROR: at least one ortholog must be found!')
            else:
                # add the only available length
                lenListOrthoA.append(lenDictA[orthoListA[0]])
            # filter Inparalogs from A
            droppedInpaDictA, keptInpaListA = filter_inparalogs(
                inparaDictRawA,
                lenDictA,
                orthoLenList=lenListOrthoA,
                lenRatioThr=lenThr,
                debug=debug)
        '''
        if clstrId == '777':
            print(orthoListA)
            print(inparaDictRawA)
            print(lenListOrthoA)
            print(droppedInpaDictA)
            print(keptInpaListA)
            sys.exit('Test single cluster')
        #'''

        # extract orthologs and inparalogs from B
        orthoListB, inparaDictRawB = extract_paralogs(paraB, debug=debug)
        orthoRdCntB += len(orthoListB)
        inparaRdCntB += len(inparaDictRawB)

        # If there are InParalogs then check if they should be kept or not
        keptInpaListB = []
        droppedInpaDictB = OrderedDict()
        if len(inparaDictRawB):
            # set ortholog length for B
            lenListOrthoB = []
            if len(orthoListB) > 1:
                lenListOrthoB = calc_ortholog_leghths(orthoListB,
                                                      lenDictB,
                                                      debug=debug)
            elif len(orthoListB) == 0:
                sys.exit('ERROR: at least one ortholog must be found!')
            else:
                # add the only available length
                lenListOrthoB.append(lenDictB[orthoListB[0]])
            # filter Inparalogs from B
            droppedInpaDictB, keptInpaListB = filter_inparalogs(
                inparaDictRawB,
                lenDictB,
                orthoLenList=lenListOrthoB,
                lenRatioThr=lenThr,
                debug=debug)

        # START WRITING THE NEW TABLE
        ofdNewTbl.write('%s\t%s\t' % (clstrId, score))

        # Write the output cluster for A
        # Write core orthologs for A
        tmpLnList = []
        for orthoTmpGene in orthoListA:
            tmpLnList.append('%s 1.0' % (orthoTmpGene))
        ofdNewTbl.write(' '.join(tmpLnList))
        orthoWrtCntA += len(tmpLnList)

        # Write rejected inparalogs
        for k in droppedInpaDictA:
            tmpLenDiff, tmpConf, inparaVsOrthoRatio = droppedInpaDictA[k]
            ofdRjct.write(
                '%s\t%s\t%s\t%s\t%s\t%s\n' %
                (clstrId, score, k, tmpConf, tmpLenDiff, inparaVsOrthoRatio))
        # reset tmp list
        tmpLnList.clear()
        # write valid inparalogs to cluster
        for tmpInparaA in keptInpaListA:
            tmpLnList.append('%s %s' %
                             (tmpInparaA, inparaDictRawA[tmpInparaA]))
        if len(tmpLnList) > 0:
            ofdNewTbl.write(' %s' % (' '.join(tmpLnList)))
        inparaWrtCntA += len(tmpLnList)
        tmpLnList.clear()

        # now start writing the right part of the cluster
        ofdNewTbl.write('\t')

        # Write core orthologs for B
        tmpLnList.clear()
        for orthoTmpGene in orthoListB:
            tmpLnList.append('%s 1.0' % (orthoTmpGene))
        ofdNewTbl.write(' '.join(tmpLnList))
        orthoWrtCntB += len(tmpLnList)

        # Write rejected inparalogs
        for k in droppedInpaDictB:
            tmpLenDiff, tmpConf, inparaVsOrthoRatio = droppedInpaDictB[k]
            ofdRjct.write(
                '%s\t%s\t%s\t%s\t%s\t%s\n' %
                (clstrId, score, k, tmpConf, tmpLenDiff, inparaVsOrthoRatio))

        # reset tmp list
        tmpLnList.clear()
        # write valid inparalogs to cluster
        for tmpInparaB in keptInpaListB:
            tmpLnList.append('%s %s' %
                             (tmpInparaB, inparaDictRawB[tmpInparaB]))
        if len(tmpLnList) > 0:
            ofdNewTbl.write(' %s' % (' '.join(tmpLnList)))
        inparaWrtCntB += len(tmpLnList)
        tmpLnList.clear()

        # close the cluster line
        ofdNewTbl.write('\n')

        ###### TEST #########
        '''
        # Try to rewrite the table as it was originally
        ofdNewTbl.write('%s\t%s\t'%(clstrId, score))
        tmpLnList = []
        # write cores orthologs for A
        for orthoTmpGene in orthoListA:
            tmpLnList.append('%s 1.0'%(orthoTmpGene))
        ofdNewTbl.write(' '.join(tmpLnList))
        orthoWrtCntA += len(tmpLnList)
        # reset tmp list
        tmpLnList.clear()
        for k in inparaDictRawA:
            tmpLnList.append('%s %s'%(k, inparaDictRawA[k]))
        if len(tmpLnList) > 0:
            ofdNewTbl.write(' %s'%(' '.join(tmpLnList)))
        inparaWrtCntA += len(tmpLnList)
        tmpLnList.clear()
        #'''
        '''
        # now start with the right part of the cluster
        ofdNewTbl.write('\t')
        # write the orthologs for B
        for orthoTmpGene in orthoListB:
            tmpLnList.append('%s 1.0'%(orthoTmpGene))
        ofdNewTbl.write(' '.join(tmpLnList))
        orthoWrtCntB += len(tmpLnList)
        # reset tmp list
        tmpLnList.clear()
        # write InParalogs
        for k in inparaDictRawB:
            tmpLnList.append('%s %s'%(k, inparaDictRawB[k]))
        if len(tmpLnList) > 0:
            ofdNewTbl.write(' %s'%(' '.join(tmpLnList)))
        inparaWrtCntB += len(tmpLnList)
        ofdNewTbl.write('\n')
        ######################
        #'''

    # close output files
    ofdNewTbl.close()
    ofdRjct.close()

    if debug:
        print('\nRead orthologs A/B; inparalogs A/B:')
        print('%d\t%d\t%d\t%d' %
              (orthoRdCntA, orthoRdCntB, inparaRdCntA, inparaRdCntB))
        print('\nWritten orthologs A/B; inparalogs A/B:')
        print('%d\t%d\t%d\t%d' %
              (orthoWrtCntA, orthoWrtCntB, inparaWrtCntA, inparaWrtCntB))
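For reference (an illustration added here, not part of the original example): each data row of the ortholog table follows the format shown in the comment near the top of the function (OrtoId, Score, OrtoA and OrtoB, tab-separated), and within OrtoA/OrtoB each gene is followed by a confidence score, with 1.0 marking a core ortholog and lower values marking inparalogs. A minimal stand-in for the extract_paralogs step (the real helper is not shown above) could split one such field as follows:

from collections import OrderedDict

def split_ortho_field(field):
    # field example: '9606_P21399 1.0 9606_P48200 0.201'
    # confidence 1.0 -> core ortholog; anything lower -> inparalog
    tokens = field.split(' ')
    orthoList = []
    inparaDict = OrderedDict()
    for gene, conf in zip(tokens[0::2], tokens[1::2]):
        if conf == '1.0':
            orthoList.append(gene)
        else:
            inparaDict[gene] = conf
    return orthoList, inparaDict

# Example, mirroring the second cluster of the format comment above:
# split_ortho_field('9606_P21399 1.0 9606_P48200 0.201')
# -> (['9606_P21399'], OrderedDict([('9606_P48200', '0.201')]))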