Exemplo n.º 1
0
def main():
    """
    Run the selected annotator against every organism listed in a file.

    The options come from ``mainUsage()``; ``options.run`` names the annotator
    module and ``options.list`` the list file. Each useful line of the list
    file contains ``common_name||sequence_file``. Lines starting with '!' and
    lines without '||' are skipped.

    For every remaining line a command line is built and either submitted
    through ``util.submitJob`` (when ``util.isLsf()`` is true) or executed
    with ``util.runProcess``.

    Any exception raised while processing the list is logged with
    ``log.error`` and not propagated.
    """
    options = mainUsage()

    script = options.run
    checkDoRun(script)

    try:
        # Read organism common name and related fasta sequence file
        list_file = options.list
        util.checkFile(list_file)
        # The context manager closes the list file even if a step below fails
        with open(list_file, "r") as handle:
            for line in handle:
                # Lines starting with '!' are skipped
                if line.startswith('!'):
                    continue
                # Expected layout: common_name||sequence_file
                if '||' not in line:
                    continue
                common_name, input_file = util.splitLine(line)
                util.checkFile(input_file)
                # Run command
                cmd = 'python -c "from genepy.annotators.%s import doRun; doRun()" -o %s -i %s' % (script, common_name, input_file)
                if util.isLsf():
                    job_name = "%s.%s" % (common_name, ANNOTATOR_EXTENSION[script])
                    util.submitJob(job_name, cmd, ANNOTATOR_QUEUE[script])
                else:
                    util.runProcess(cmd)
    except Exception as e:
        # Top-level boundary: report the problem instead of crashing
        log.error(e)
Exemplo n.º 2
0
def fasta2embl(infasta):
    """
    Transform sequence file format in fasta to embl using EMBOSS seqret

    The output file is written next to the input file. Its name is the input
    base name up to the first dot, followed by ".embl".

    Returns the name of created embl file
    """
    import os.path

    util.checkFile(infasta)
    # Only strip the extension from the file name itself: splitting the whole
    # path on "." would truncate paths such as "./genome.fasta" to ".embl".
    folder, basename = os.path.split(infasta)
    outembl = os.path.join(folder, basename.split(".")[0] + ".embl")
    # Usage: seqret
    # Online documentation: http://emboss.open-bio.org/wiki/Appdoc:Seqret
    #
    #   Standard (Mandatory) qualifiers:
    #   [-sequence]          seqall     (Gapped) sequence(s) filename and optional
    #                                   format, or reference (input USA)
    #   [-outseq]            seqoutall  [<sequence>.<format>] Sequence set(s)
    #                                   filename and optional format (output USA)
    #
    #   The basic USA syntax is one of:
    #     "file"
    #     "file:entry"
    #     "format::file"
    #     "format::file:entry"
    #     "database:entry"
    #     "database"
    #     "@file"
    # Create EMBOSS seqret command line
    cmd = "seqret -sequence fasta::%s -outseq embl::%s " % (infasta, outembl)
    # Call the subprocess using convenience method
    util.runProcess(cmd)
    # Log one pre-formatted message: extra positional arguments without
    # matching placeholders are not rendered by the logger.
    logger.info("File %s created" % outembl)
    return outembl
Exemplo n.º 3
0
def main():
    """
    Compare the GenDB, RAST and IMG annotations of every organism in a list.

    The list file is given with -l/--list and holds one organism common name
    per line. Lines starting with '!' and blank lines are skipped. An organism
    is compared with ``doCompare`` only when its three EMBL result files exist
    under GenDB/, RAST/ and IMG/; otherwise a message is printed and the
    organism is skipped.
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-l", "--list", metavar="FILE", help="FILE containing the list of all organism common names to compare", action="store", type="string", dest="list")

    (options, args) = parser.parse_args()

    # Print help if no argument given
    if util.printHelp(options):
        parser.print_help()
        sys.exit()

    if options.list:
        # Read organism common names
        list_file = options.list
        util.checkFile(list_file)
        # The context manager closes the list file once the loop is done
        with open(list_file, "r") as handle:
            for line in handle:
                if line.startswith('!'):
                    continue
                common_name = line.strip()
                # A blank line carries no organism name
                if not common_name:
                    continue

                gendb_file = "GenDB/%s.gendb.embl" % common_name
                rast_file = "RAST/%s.rast.embl" % common_name
                img_file = "IMG/%s.img.embl" % common_name
                result_files = (gendb_file, rast_file, img_file)
                if not all(os.path.exists(path) for path in result_files):
                    print("No three results for %s" % common_name)
                    continue

                print("Processing %s" % common_name)
                doCompare(common_name, gendb_file, rast_file, img_file)
Exemplo n.º 4
0
def splitSeq(dir, embl, type):
    """
    Extract the features of one type from an EMBL file with EMBOSS extractfeat.

    dir  -- output directory, created through util.createDir
    embl -- input EMBL file, checked through util.checkFile
    type -- feature type handed to the -type qualifier (for instance "CDS")

    Online documentation: http://emboss.open-bio.org/wiki/Appdoc:Extractfeat

    Qualifiers used by the command line built here:
      -sequence embl::FILE   input sequence, read as EMBL
      -type TYPE             feature type to extract
      -featinname YES        add the feature type to the output sequence ID
      -outseq fasta::        write the output in fasta format
      -osextension2 ffn      qualifier passed as-is to extractfeat
      -ossingle2 Yes         separate file for each entry
      -osdirectory2 DIR      qualifier passed as-is to extractfeat
    """
    template = "extractfeat -sequence embl::%s -type %s -featinname YES -outseq fasta:: -osextension2 ffn -ossingle2 Yes -osdirectory2 %s"
    # Input must exist and the output directory is prepared before the run
    util.checkFile(embl)
    util.createDir(dir)
    util.runProcess(template % (embl, type, dir))
    message = "Sequences extracted into %s" % dir
    logger.info(message)
Exemplo n.º 5
0
def main():
    """
    Command line entry point of the submitter.

    Options:
      -l/--list FILE   organism list file handed to the submitter
      -r/--run SCRIPT  submitter to run, one of SUBMIT_SCRIPTS
      --submit         really submit the data instead of only checking

    The command line is logged. The ``doRun`` function of the selected
    submitter module is then called with the list file and the submit flag.
    Any exception is logged together with its traceback and not propagated.
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option(
        "-l",
        "--list",
        metavar="FILE",
        help="FILE containing the list of all organism common names and its associated parameters depending on submitter type",
        action="store",
        type="string",
        dest="list",
    )
    parser.add_option(
        "-r",
        "--run",
        metavar="SCRIPT",
        help="name of the script to run from %s against each genome of the list" % SUBMIT_SCRIPTS,
        action="store",
        choices=SUBMIT_SCRIPTS,
        dest="run",
    )
    parser.add_option("--submit", help="To submit data, not only for checking", action="store_true", dest="submit")

    (options, args) = parser.parse_args()

    # Print help if no argument given
    if util.printHelp(options):
        parser.print_help()
        sys.exit()

    # Print command line
    cmdline = "$ python " + "".join(arg + " " for arg in sys.argv)
    log.info(cmdline)

    script = options.run
    try:
        if options.list:
            util.checkFile(options.list)
            # Submitter modules are imported lazily, only when selected
            if script == "genome_project":
                import submitters.genome_project as genome_project

                genome_project.doRun(options.list, options.submit)
            elif script == "annotated_genome":
                import submitters.annotated_genome as annotated_genome

                annotated_genome.doRun(options.list, options.submit)
            else:
                # Report the no-op instead of returning silently
                log.error("No submitter available for script %s" % script)
        else:
            log.info("Organism list file not provided! Please provide one using -l FILE or --list=FILE")

    except Exception as e:
        import traceback

        # format_exc() renders the traceback of the exception being handled;
        # extract_stack() would only describe the stack of this handler.
        log.error(traceback.format_exc())
        log.error(e)
Exemplo n.º 6
0
def writeFile(data, address):
    """
    Serve one write request (WRQ) coming from a client.

    A dedicated UDP socket is bound to a random port of HOST. If
    ``checkFile(data)`` reports that the file already exists, an ERROR package
    with error code 6 is sent back. Otherwise the request is acknowledged with
    block number 0, the content is received through ``getterHandler`` and, when
    something was received, stored as ``f/<filename>``.

    :param data: raw request received from the client
    :param address: address of the client, used as destination of the replies
    :return: None

    The socket is closed on every exit path, including errors.
    """
    writeSocket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        writeSocket.bind((HOST, randint(1024, 65535)))
        writeSocket.settimeout(TIMEOUT)

        # The file name is recovered from the textual form of the request.
        # NOTE(review): the name comes from the network and is used below to
        # build a path without sanitising - confirm that "../" cannot escape f/.
        filename = str(data).split('\\x')[2][2:]

        if checkFile(data):
            serviceMsg('File already exists.')
            ERRpackage = generatePackage(type=OPCODE['ERROR'], errorCode=6)
            writeSocket.sendto(ERRpackage, address)
            serviceMsg(f'Sent error to client: {address}. Raw data: {ERRpackage}')
            return

        ACKpackage = generatePackage(type=OPCODE['ACK'], blockNumber=0)
        writeSocket.sendto(ACKpackage, address)
        serviceMsg(f'Sent ACK after WRQ to client: {address}. Raw data: {ACKpackage}. Block number: {0}')

        fileData = getterHandler(writeSocket, BLOCK_SIZE)

        # Nothing is written when the transfer did not deliver any data
        if fileData is not None:
            writeFileBytes(f'f/{filename}', fileData)
            serviceMsg(f'Successfully wrote data to f/{filename}')
    finally:
        # Release the socket whatever happened above
        writeSocket.close()
Exemplo n.º 7
0
def concatFeatures(embl, features):
    """
    Concat CDS features in embl format into embl sequence file
      - the first two lines of embl sequence containing ID & XX lines
      - the CDS features file containing FT lines
      - the rest of embl sequence containing SQ lines

    The output file is written next to the embl file. Its name is the embl
    base name up to the first dot, followed by "_with_cds.embl".

    Returns the name of created embl sequence file
    """
    import os.path

    util.checkFile(embl)
    util.checkFile(features)
    # Only strip the extension from the file name itself: splitting the whole
    # path on "." would truncate paths such as "./genome.embl".
    folder, basename = os.path.split(embl)
    outembl = os.path.join(folder, basename.split(".")[0] + "_with_cds.embl")
    # Header lines of the sequence file, followed by the features
    head_cmd = "head -n 2 %s > %s; cat %s >> %s;" % (embl, outembl, features, outembl)
    util.runProcess(head_cmd)
    # Rest of the sequence file, from line 3 on. "tail -n +3" is the portable
    # spelling (the historical "tail +3" is rejected by current tail versions)
    # and appending directly avoids a scratch file in the working directory.
    tail_cmd = "tail -n +3 %s >> %s;" % (embl, outembl)
    util.runProcess(tail_cmd)
    # Log one pre-formatted message: extra positional arguments without
    # matching placeholders are not rendered by the logger.
    logger.info("File %s created" % outembl)
    return outembl
Exemplo n.º 8
0
def runReciprocalFasta(seq_dir, genome_file, fasta_dir):
    """
    Run FASTA between extracted in-house protein sequences against new genome

    seq_dir     -- directory holding the extracted sequences (*.faa files)
    genome_file -- sequence file of the new genome
    fasta_dir   -- directory receiving one result file (*.fa) per sequence

    When IS_LSF is set, the sequence files are renamed to refgenome_1.faa ..
    refgenome_N.faa so that a job array can address them through
    ${LSB_JOBINDEX}. Otherwise fasta35 is run on each file in turn.

    FASTA searches a protein or DNA sequence data bank
     version 35.04 Aug. 25, 2009
     W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448
    """
    util.createDir(fasta_dir)
    # Check new genome
    util.checkFile(genome_file)
    # Check ref genome extracted sequences
    util.checkDir(seq_dir)
    res_dir = fasta_dir
    # Sorted so the processing order does not depend on the file system;
    # only names really ending in .faa are sequence files.
    faa_files = sorted(name for name in os.listdir(seq_dir) if name.endswith('.faa'))
    if IS_LSF:
        # Rename sequences for job array to be refgenome_1.faa refgenome_2.faa ...
        seq_num = len(faa_files)
        wanted = ["refgenome_%s.faa" % index for index in range(1, seq_num + 1)]
        present = set(faa_files)
        expected = set(wanted)
        # Files already carrying a wanted name stay in place. The others take
        # the names still free, so a partially renamed directory never gets
        # one of its files overwritten.
        free_names = [name for name in wanted if name not in present]
        misnamed = [name for name in faa_files if name not in expected]
        for seq_file, new_name in zip(misnamed, free_names):
            seq_filepath = "%s/%s" % (seq_dir, seq_file)
            seq_newfilepath = "%s/%s" % (seq_dir, new_name)
            os.rename(seq_filepath, seq_newfilepath)
        # Submit bsub job array on refgenome_${LSB_JOBINDEX}.faa against mygenome
        bsub_dir = "bsub"
        util.checkDir(bsub_dir)
        cmd = "fasta35 -z 1 -Q -H -S -m 10 %s/refgenome_${LSB_JOBINDEX}.faa %s > %s/refgenome_${LSB_JOBINDEX}.fa" % (seq_dir, genome_file, res_dir)
        util.submitJobArray(jobname="genepy-recipfasta", jobnum=seq_num, jobdir=bsub_dir, cmd=cmd)
        util.submitJobDependency('genepy-recipfasta')
        logger.info("Reciprocal Fasta on LSF finished")
    else:
        # List of inhouse extracted genome sequences
        for seq_file in faa_files:
            res_file = seq_file.split(".")[0] + ".fa"
            cmd = "fasta35 -z 1 -Q -H -S -m 10 %s/%s %s > %s/%s" % (seq_dir, seq_file, genome_file, res_dir, res_file)
            util.runProcess(cmd)
            logger.info(seq_file)
        logger.info("Reciprocal Fasta finished")
Exemplo n.º 9
0
def splitSeqWithBiopython(embl, type):
    """
    Extract the sequence of every CDS feature of an EMBL file using Biopython

    embl -- EMBL file holding exactly one record with its features
    type -- not used: the feature type is fixed to 'CDS' in the body

    The directory sequences/ is created, but the extracted sequences are only
    logged at debug level in fasta format; no sequence file is written here.

    Exits the program when the record has no feature at all.
    """
    util.checkFile(embl)
    # Create directory sequences/
    dirname = "sequences/"
    util.createDir(dirname)
    # The context manager closes the EMBL file as soon as the record is read
    with open(embl, "rU") as handle:
        record = SeqIO.read(handle, "embl")
    if not record.features:
        sys.exit("ERROR: EMBL file %s without features" % embl)
    for feature in record.features:
        if feature.type != 'CDS':
            continue
        seq = record.seq

        # Build up a list of (start,end) tuples that will be used to slice the sequence
        if feature.sub_features:
            # This gene is made up of multiple parts.
            locations = [(sf.location.start.position, sf.location.end.position)
                         for sf in feature.sub_features]
        else:
            # This gene is made up of one part. Store its start and end position.
            locations = [(feature.location.start.position, feature.location.end.position)]

        # Join the nucleotides of all the parts forming the CDS.
        seq_str = ''.join(seq[begin:end].tostring() for begin, end in locations)

        # Reverse complement the sequence if the CDS is on the minus strand
        if feature.strand == -1:
            seq_obj = Seq(seq_str, IUPAC.ambiguous_dna)
            seq_str = seq_obj.reverse_complement().tostring()

        logger.debug(feature)
        logger.debug(SeqRecord(seq=Seq(seq_str), id=feature.qualifiers['systematic_id'][0], description=feature.type).format('fasta'))

    logger.info("Sequences extracted into %s" % dirname)
Exemplo n.º 10
0
def readConf(file):
    '''
    Load a configuration file into a dictionary.

    The format is chosen from the text after the last dot of the name:
    "ini" files are read with SafeConfigParser into a dict of sections, each
    section being a dict of its options; "yaml" and "yml" files are loaded
    with yaml. A missing file or any other extension is logged as an error
    and ends the program with exit status 1.

    :param file: location of config file
    :return: conf file as dict
    '''
    def stamp():
        # Current local time, formatted for the log messages
        return datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')

    if not checkFile(file):
        logger.error('[{}] : Configuration file not found at {}'.format(stamp(), file))
        sys.exit(1)

    extension = file.split('.')[-1]

    if extension == 'ini':
        ini = SafeConfigParser()
        ini.read(file)
        return {section: dict(ini.items(section)) for section in ini.sections()}

    if extension in ('yaml', 'yml'):
        # NOTE(review): unsafe_load can build arbitrary Python objects; only
        # acceptable while configuration files come from a trusted source.
        with open(file) as cf:
            return yaml.unsafe_load(cf)

    logger.error('[{}] : Unsupported configuration file extension {}'.format(stamp(), extension))
    sys.exit(1)
Exemplo n.º 11
0
def main():
    """
    Command line entry point: prepare the data, search orthologs, clean up.

    Fasta file extensions used along the way:
      .ffn untranslated nucleotide sequences for each CDS
      .faa protein coding sequences (CDS)
      .fa  fasta alignment results
      .fna whole genomic DNA sequences
      .frn nucleotide sequences of RNA related features

    Steps, each one driven by the command line options:
      1. data preparation (--dna with --tab, or --embl, and --genedb)
      2. ortholog search (--fasta) and HAMAP scan (--hamap)
      3. removal of results (--clean) or of everything (--deepclean)
    """
    parser = OptionParser(usage="usage: %prog [Options]")
    # Options taking an input file
    for short_flag, name, text in (
        ("-d", "dna", "input dna FILE in fasta format"),
        ("-t", "tab", "input tab FILE in embl format"),
        ("-e", "embl", "input embl FILE with CDS features in embl format"),
    ):
        parser.add_option(short_flag, "--" + name, metavar="FILE", help=text, action="store", type="string", dest=name)
    # Boolean switches
    for long_flag, dest, text in (
        ("--genedb", "db", "extract reference genome protein sequences from geneDB"),
        ("--fasta", "fasta", "run fasta against each extracted in-house genomes"),
        ("--hamap", "hamap", "run pfscan against HAMAP profiles"),
        ("--clean", "clean", "delete all results without deleting reference genomes"),
        ("--deepclean", "deepclean", "delete all reference genomes and results"),
    ):
        parser.add_option(long_flag, help=text, action="store_true", dest=dest)
    options, _ = parser.parse_args()

    # Print help if no argument given
    if util.printHelp(options):
        parser.print_help()
        sys.exit()

    # Print command line
    logger.debug("$ python " + "".join(arg + " " for arg in sys.argv))

    # >>> ---------------------------------------------------------------------
    # >>> DATA PREPARATION
    # >>> ---------------------------------------------------------------------
    # List of needed software
    for softname in soft_lists:
        util.checkSoft(softname)

    # Prepare new genome data, either from dna + tab files or from one embl file
    from_fasta = options.dna and options.tab and not options.embl
    from_embl = not options.dna and not options.tab and options.embl
    if from_fasta:
        util.checkFile(options.dna)
        embl_with_cds = concatFeatures(fasta2embl(options.dna), options.tab)
    elif from_embl:
        embl_with_cds = options.embl
    if from_fasta or from_embl:
        splitSeq(mygenome_dir, embl_with_cds, "CDS")
        translateSeq(mygenome_dir)
    elif not options.deepclean:
        util.checkDir(mygenome_dir)

    # Extract in house genomes from chado db
    if options.db:
        chadoDump(refgenomes_dir)
    elif not options.deepclean:
        util.checkDir(refgenomes_dir)

    # bsub output directory
    if IS_LSF and not options.clean and not options.deepclean:
        util.createDir(bsub_dir)

    # >>> ---------------------------------------------------------------------
    # >>> ORTHOLOG SEARCH
    # >>> ---------------------------------------------------------------------
    # Run fasta & reciprocal fasta
    if options.fasta:
        runFasta(mygenome_dir, refgenomes_dir, fasta_dir)
        fasta_hits = topFastaHits(fasta_dir, refgenomes_extractedseq_dir)
        concatSeq(mygenome_fastafile_allcds, mygenome_dir)
        runReciprocalFasta(refgenomes_extractedseq_dir, mygenome_fastafile_allcds, reciprocalfasta_dir)
        reciprocalfasta_hits = topReciprocalFastaHits(reciprocalfasta_dir)
        printMSPCrunch(fasta_hits, reciprocalfasta_hits)
        hits = getHits(fasta_hits, reciprocalfasta_hits)
        for title, key in (("ORTHOLOGS", 'ortholog'), ("SIMILARITY", 'similarity')):
            logger.info(title)
            logger.info(hits[key])
        transferFeatures(hits['ortholog'])

    # Run hamap scan
    if options.hamap:
        runHamapScan(mygenome_dir, hamap_dir)

    # >>> ---------------------------------------------------------------------
    # >>> CLEANING OUTPUT DATA
    # >>> ---------------------------------------------------------------------
    # Clean results before a re-run
    if options.clean:
        # fasta results
        for result_dir in (fasta_dir, reciprocalfasta_dir, refgenomes_extractedseq_dir):
            util.rmDir(result_dir)
        util.rmFile(mygenome_fastafile_allcds)
        # hamap results
        util.rmDir(hamap_dir)
        # bsub outputs
        if IS_LSF:
            util.rmDir(bsub_dir)

    # Deep clean - remove all
    if options.deepclean:
        for data_dir in (refgenomes_dir, mygenome_dir, fasta_dir, reciprocalfasta_dir, refgenomes_extractedseq_dir):
            util.rmDir(data_dir)
        util.rmFile(mygenome_fastafile_allcds)
        util.rmDir(hamap_dir)