def getSanitizedContigList(cur):
    '''
    Build a lookup table from sanitized contig IDs to the original contig IDs.

    cur is a database cursor; it is used to read the distinct contig_mod
    values from the "contigs" table.

    Returns a dictionary whose keys are the sanitized contig IDs and whose
    values are the matching unsanitized contig IDs present in the database.
    '''
    cur.execute("SELECT DISTINCT contig_mod FROM contigs;")
    # If two contig IDs sanitize to the same string, the one read last wins.
    return {sanitizeString(row[0], False): row[0] for row in cur}
def make_region_drawing(genelocs, getcolor, centergenename, maxwidth):
    '''
    Draw the genes of one region as arrows and save the figure as a PNG file.

    genelocs is a re-iterable collection of gene features. Each feature must
    provide id, location.start, location.end, strand and
    qualifiers["cluster_id"] (presumably Biopython SeqFeature objects -
    TODO confirm).
    getcolor maps a cluster ID to the fill color used for that gene's arrow.
    centergenename is the id of the gene of interest. Its arrow gets a red
    border, and the page margins are computed from the position of its
    midpoint so that it lines up across figures.
    maxwidth is divided by the diagram scale (20, documented in the code as
    "AA per px") to get the page width.

    Returns the path of the PNG file, /tmp/<sanitized centergenename>.png.
    If the gene of interest is on the reverse strand (strand == -1) the image
    is rotated by 180 degrees with the external "convert" program.

    Raises ValueError if no feature in genelocs has the id centergenename.

    TODO make auto-del tempfiles, or pass svg as string
    '''
    # Imported locally so the function does not rely on a module-level import.
    import subprocess

    imgfileloc = "/tmp/%s.png" %(sanitizeString(centergenename, False))

    # Set up an empty genome diagram object with a single feature track
    gd_diagram = GenomeDiagram.Diagram("Genome Region")
    gd_track_for_features = gd_diagram.new_track(1, name="Annotated Features")
    gd_feature_set = gd_track_for_features.new_set()

    # Some basic properties of the figure itself
    arrowshaft_height = 0.3
    arrowhead_length = 0.3
    default_fontsize = 30 # Font size for genome diagram labels
    scale = 20     #AA per px for the diagram

    # Build arrow objects for all of our features and remember where the
    # gene of interest is (if several features share its id, the last wins).
    foundcenter = False
    centerdstart = centerend = centerdstrand = None
    for feature in genelocs:
        bordercol = rcolors.white
        if feature.id == centergenename:
            foundcenter = True
            bordercol = rcolors.red
            centerdstart, centerend = int(feature.location.start), int(feature.location.end)
            centerdstrand = feature.strand
        color = getcolor[feature.qualifiers["cluster_id"]]

        gd_feature_set.add_feature(feature, name = feature.id,
                                   color=color, border = bordercol,
                                   sigil="ARROW", arrowshaft_height=arrowshaft_height, arrowhead_length = arrowhead_length,
                                   label=False,  label_angle=20, label_size = default_fontsize
                                   )

    # Without the center gene there is nothing to align the figure on. Fail
    # with a clear message instead of an UnboundLocalError further down.
    if not foundcenter:
        raise ValueError("ERROR: Center gene %s was not found in the provided gene locations" %(centergenename,))

    start, end = regionlength(genelocs)
    pagew_px = maxwidth / scale
    # Offset so the gene of interest lines up in all the figures
    midcentergene = abs(centerend - centerdstart)/2 + min(centerdstart, centerend)
    l2mid = abs(midcentergene - start)
    r2mid = abs(midcentergene - end)
    # NOTE(review): the "r" offset is derived from the left-hand distance and
    # vice versa, and the page width is passed as the second pagesize entry.
    # This is kept exactly as found - confirm it is intentional.
    roffset = float((pagew_px/2) - (l2mid/scale))
    loffset = float((pagew_px/2) - (r2mid/scale))

    gd_diagram.draw(format="linear", start=start, end=end, fragments=1, pagesize=(225, pagew_px), xl=(loffset/pagew_px), xr=(roffset/pagew_px) )

    gd_diagram.write(imgfileloc, "PNG")
    # Flip for reversed genes. The arguments are passed as a list so the file
    # name is never interpreted by a shell.
    if centerdstrand == -1:
        subprocess.call(["convert", "-rotate", "180", imgfileloc, imgfileloc])
    return imgfileloc
def getSanitizedContigList(cur):
    """
    Map sanitized contig IDs back to the contig IDs stored in an ITEP database.

    cur is a SQLite cursor pointing at an ITEP database.

    Returns a dictionary from sanitized to unsanitized contig IDs
    present in the database.
    """
    cur.execute("SELECT DISTINCT contig_mod FROM contigs;")
    # Contig IDs that sanitize to the same string overwrite each other;
    # the one read last is kept.
    return {sanitizeString(row[0], False): row[0] for row in cur}
示例#4
0
def getSanitizedContigList(cur):
    '''
    Look up every contig ID in an ITEP database under its sanitized name.

    cur is a SQLite cursor pointing at an ITEP database.

    Returns a dictionary from sanitized to unsanitized contig IDs
    present in the database.
    '''
    cur.execute("SELECT DISTINCT contig_mod FROM contigs;")
    originals = [row[0] for row in cur]
    # Later entries replace earlier ones that sanitize to the same key.
    return dict((sanitizeString(contig, False), contig) for contig in originals)
def organismNameToId(orgname, cur, issanitized=False):
    '''
    Given an organism name, return the ID for that organism name.

    orgname is the organism name to look up.
    cur is a database cursor used to read the "organisms" table.
    Use issanitized = True if the provided organism name has been sanitized
    with the sanitizeString() function. The names read from the database are
    then sanitized the same way before they are compared.

    Raises ValueError (naming the organism) if the name is not in the database.
    '''
    cur.execute("SELECT organism, organismid FROM organisms;")
    orgToId = {}
    for res in cur:
        key = sanitizeString(res[0], False) if issanitized else res[0]
        orgToId[key] = res[1]
    if orgname not in orgToId:
        # The name is formatted into the message so the caller can see which lookup failed.
        raise ValueError("ERROR: Organism name %s not found in database" %(orgname,))
    return orgToId[orgname]
示例#6
0
def organismNameToId(orgname, cur, issanitized = False):
    '''
    Given an organism name, return the ID for that organism name.

    orgname is the organism name to look up.
    cur is a database cursor used to read the "organisms" table.
    Use issanitized = True if the provided organism name has been sanitized
    with the sanitizeString() function. The names read from the database are
    then sanitized the same way before they are compared.

    Raises ValueError (naming the organism) if the name is not in the database.
    '''
    q = "SELECT organism, organismid FROM organisms;"
    cur.execute(q)
    orgToId = {}
    for res in cur:
        if issanitized:
            orgToId[sanitizeString(res[0], False)] = res[1]
        else:
            orgToId[res[0]] = res[1]
    if orgname in orgToId:
        return orgToId[orgname]
    # Include the name in the message; it used to contain a literal "%s".
    raise ValueError("ERROR: Organism name %s not found in database" %(orgname,))
示例#7
0
def organismIdToName(orgid, cur, issanitized=False):
    r'''
    Convert an organism ID (in format \d+\.\d+) into the name of the organism.

    orgid is the organism ID to look up.
    cur is a database cursor used to read the "organisms" table.
    If issanitized is True we expect the format \d+_\d+ instead; the IDs read
    from the database are passed through sanitizeString() before comparing.

    Raises ValueError (naming the ID) if the ID is not in the database.
    '''
    cur.execute("SELECT organism, organismid FROM organisms;")
    idToOrg = {}
    for res in cur:
        key = sanitizeString(res[1], False) if issanitized else res[1]
        idToOrg[key] = res[0]
    if orgid not in idToOrg:
        # The ID is formatted into the message so the caller can see which lookup failed.
        raise ValueError("ERROR: Organism ID %s not found in database" %(orgid,))
    return idToOrg[orgid]
def organismIdToName(orgid, cur, issanitized=False):
    r'''
    Convert an organism ID (in format \d+\.\d+) into the name of the organism.

    orgid is the organism ID to look up.
    cur is a database cursor used to read the "organisms" table.
    If issanitized is True we expect the format \d+_\d+ instead; the IDs read
    from the database are passed through sanitizeString() before comparing.

    Raises ValueError (naming the ID) if the ID is not in the database.
    '''
    q = "SELECT organism, organismid FROM organisms;"
    cur.execute(q)
    idToOrg = {}
    for res in cur:
        if issanitized:
            idToOrg[sanitizeString(res[1], False)] = res[0]
        else:
            idToOrg[res[1]] = res[0]
    if orgid in idToOrg:
        return idToOrg[orgid]
    # Include the ID in the message; it used to contain a literal "%s".
    raise ValueError("ERROR: Organism ID %s not found in database" %(orgid,))
# Lookup tables keyed by the sanitized gene ID.
geneToOrganism = {}
geneToAnnote = {}
sys.stderr.write("Reading gene annotations and organisms from database...\n")

# FIXME - This should call the library functions to get geneinfo for specific sets of genes
# instead of reading the whole table.
con = sqlite3.connect(locateDatabase())
cur = con.cursor()
cur.execute("SELECT * FROM processed;")

for row in cur:
    fields = [str(value) for value in row]
    # The SVG parser complains about some special characters (e.g. ' ), so the
    # stored values are sanitized. The key is sanitized as well so that lookups
    # work whether or not the leaf names in the input tree were sanitized.
    genekey = sanitizeString(fields[0], False)
    geneToAnnote[genekey] = sanitizeString(fields[9], False)
    geneToOrganism[genekey] = sanitizeString(fields[1], False)

# Standardize font sizes and tree width
t, ts = prettifyTree(t, show_bootstraps=not options.no_bootstraps)
# Give equivalent trees (with the same root) the same leaf order
t = standardizeTreeOrdering(t)

# Now we try and add the heatmap
# if the user requests it
#
# I borrowed some of this code from the ETE tutorial.
if options.datafile is not None:
    array = t.arraytable
    numcols = len(array.colNames)
    matrix_dist = [i for r in xrange(len(array.matrix))\
        else:
            myorgstr = "%s\t%s" %(myorgstr, "NONE")
    myorgstr = myorgstr.lstrip()
    myline = "%s\t%s\t%s\t%s" %(myrunid, myclusterid, myannote, myorgstr)
    mytable.append(myline)

# Rebuild the presence/absence table from scratch with the rows collected in mytable.
cur.execute("DROP TABLE IF EXISTS presenceabsence;")

# Three fixed leading columns, followed by one VARCHAR column per organism
# (named after the sanitized organism name).
cmd = """CREATE TABLE presenceabsence (
"runid" VARCHAR(128),
"clusterid" INT,
"annote" VARCHAR(2048)"""
cmd += "".join([""", %s VARCHAR(128)""" %(sanitizeString(org, False)) for org in orgList])
cmd += ");"

cur.execute(cmd)

for ln in mytable:
    sp = ln.split("\t")
    # One "?" placeholder per tab-separated field of the row.
    cmd = "INSERT INTO presenceabsence VALUES (" + ",".join(["?"] * len(sp)) + ");"
    cur.execute(cmd, tuple(sp))
# The new table and its rows only persist after a commit.
con.commit()
def _kbaseLocationToSlice(loc):
    '''Convert one location entry of a KBase feature into Python slice coordinates.

    loc is indexed as (contig, start, strand, length), where strand is "-" for
    the reverse strand and anything else is treated as the forward strand.

    Returns (contig, seqstart, seqstop, strand) with seqstart < seqstop and
    strand equal to 1 or -1.'''
    contig = loc[0]
    start = int(loc[1])
    strand = -1 if loc[2] == "-" else 1
    # Coerced like the start position so a numeric string is accepted too.
    featurelen = int(loc[3])
    if strand == -1:
        stop = start - featurelen + 1
    else:
        stop = start + featurelen - 1
    # FeatureLocation wants Python slicing indexes: the start is always less than
    # the stop, and the lower bound is shifted by one because a slice [a,b] only
    # goes up to position b-1.
    return contig, min(start, stop) - 1, max(start, stop), strand


def _kbaseRnaFeatureType(function):
    '''Guess the Genbank feature type of an RNA from its annotated function.

    "RNA" is not an official type in a Genbank file, so the annotation text is
    searched for hints. Returns "tRNA", "rRNA" or, if nothing matches,
    "misc_RNA". A tRNA hint takes precedence over an rRNA hint.'''
    rRNA_finders = ( "rRNA", "ribosomal", "5S", "16S", "23S", "5.8S", "28S", "18S" )
    tRNA_finders = ( "tRNA", "transfer" )
    if any(finder in function for finder in tRNA_finders):
        return "tRNA"
    if any(finder in function for finder in rRNA_finders):
        return "rRNA"
    return "misc_RNA"


def kbaseGenomeToGenbank(genome_object, taxid=None):
    '''Convert a KBase genome object into a Genbank file incorporating as much info as we can
    as found in the NCBI genbank files. The Genbank records are written to standard output.

    Note - the genome object (not to be confused with a ModelSEED "annotation" object) has both
    annotations / translations AND the DNA sequence. It's obtained by calling annotate_genome on
    an object that only has the DNA sequence.

    genome_object is a dictionary. The keys read here are "scientific_name", "id", "contigs"
    (entries with "id" and "dna") and "features" (entries with "id", "type", "location" and
    optionally "function" and "protein_translation").
    taxid is the taxon ID written into the "source" feature of every contig. If it is None the
    KBase CDMI service is asked for it, which requires the CDMI.py module.

    Returns None. Exits the program with status 2 if no taxid was given and it cannot be looked
    up. Raises AssertionError for a feature that has more than one location.'''

    organism_name = genome_object["scientific_name"]
    organism_id = genome_object["id"]

    # Get the TaxID
    # If none is specified the user has to provide one (or at least some unique integer,
    # not necessarily a tax ID) for this system to work right.
    if taxid is None:
        # CDMI.py is from the KBase - we need it to get the Taxon ID
        # Download it at http://kbase.science.energy.gov/developer-zone/downloads/
        try:
            from CDMI import CDMI_EntityAPI
        except ImportError:
            sys.stderr.write("ERROR: If no TaxID is provided, the CDMI.py file is necessary (http://kbase.science.energy.gov/developer-zone/downloads/) to attempt to guess it.\n")
            sys.exit(2)
        URL="https://www.kbase.us/services/cdmi_api/"
        cdmi_entity = CDMI_EntityAPI(URL)
        reldict = cdmi_entity.get_relationship_IsInTaxa(organism_id, [], [], ["id"])
        taxidlist = []
        if reldict is not None:
            taxidlist = getFieldFromRelationship(reldict, "id", "to")
        # An empty answer is reported the same way as no answer at all instead of
        # failing with an IndexError.
        if not taxidlist:
            sys.stderr.write("ERROR: TaxID for Organism ID %s not found in the KBase CDMI. You will need to specify it manually if you want it\n" %(organism_id))
            sys.exit(2)
        taxid = taxidlist[0]

    annotations = { 'source': organism_name, 'organism': organism_name }

    # Specify contig data and "source" features for each contig (required by the genbank standard)
    contig_to_sequence = {}
    contig_to_feature_data = {}
    for contig in genome_object["contigs"]:
        contig_to_sequence[contig["id"]] = contig["dna"]
        qualifiers = {
            "organism": organism_name,
            "mol_type": "Genomic DNA",
            "db_xref": "taxon:%s" %(taxid),
        }
        feature = SeqFeature(FeatureLocation(0, len(contig["dna"])), strand=1, type="source", qualifiers=qualifiers)
        contig_to_feature_data[contig["id"]] = [ feature ]

    # The contig references are inside the feature definitions in the Genome object file, but
    # in a genbank file the features in a contig must all be separated.
    # Therefore I have to keep track of them in one step and then create the SeqRecord objects
    # in a separate step.
    for feature in genome_object["features"]:
        # FIXME - What do I do with things that have more than one location?
        assert len(feature["location"]) == 1, "Features with more than one location are not supported"

        # I verified against Pubseed that these semantics and calculations are correct, at least
        # for the proteins I checked that are the same between pubseed and KBase...
        contig, seqstart, seqstop, strand = _kbaseLocationToSlice(feature["location"][0])

        feature_id = feature["id"]
        feature_type = feature["type"]

        qualifiers = {}
        # Unfortunately there are features including proteins in the genome objects that have no
        # function (not even "hypothetical protein").
        # Thankfully this isn't a required field in the Genbank file
        if "function" in feature:
            qualifiers["product"] = strip_control_characters(feature["function"])
        if feature_type == "CDS" or feature_type == "peg":
            qualifiers["protein_id"] = feature_id
            qualifiers["translation"] = feature["protein_translation"]

        # We attempt to figure out based on the annotation whether it is a tRNA, rRNA, or other
        # (misc_RNA) RNA. An RNA without any function text becomes a misc_RNA instead of
        # raising a KeyError.
        if feature_type == "rna":
            feature_type = _kbaseRnaFeatureType(feature.get("function") or "")

        # I checked that the above formulas give the correct positions in the genbank file
        # (or at least, the same as the PubSEED genbank files).
        seqfeature = SeqFeature(FeatureLocation(seqstart, seqstop), strand=strand, type=feature_type, id=feature_id, qualifiers=qualifiers)

        # Attach the new features to the appropriate contig...
        contig_to_feature_data.setdefault(contig, []).append(seqfeature)

    # Create one record for each contig
    # NOTE(review): a feature that names a contig missing from genome_object["contigs"] still
    # fails here with a KeyError because there is no sequence for it - confirm whether that
    # can happen.
    records = []
    for contig in contig_to_feature_data:
        seq = Seq(contig_to_sequence[contig], IUPAC.ambiguous_dna)
        record = SeqRecord(seq, id=sanitizeString(contig, False), description = "%s contig %s" %(organism_name, contig), name=contig, features=contig_to_feature_data[contig], annotations=annotations)
        records.append(record)
    SeqIO.write(records, sys.stdout, "genbank")

    return
# A global bootstrap needs at least one bootstrap replicate to work with.
if GLOBALBOOTS and NUMBOOTS == 0:
    sys.stderr.write("ERROR: Specifying global bootstrap without any bootstraps doesnt make sense! Did you forget to specify -b?\n")
    exit(2)

# The alignment arrives as FASTA on stdin and is written out as phylip below.
# It is turned into a list so that the ID changes made in the loop are applied
# to the records that get written, not to copies that are thrown away.
aln = list(AlignIO.read(sys.stdin, "fasta"))

# Replace every ID with a numbered placeholder (S000000000, S000000001, ...)
# and remember which sanitized FASTA ID each placeholder stands for.
subToReal = {}
for i, record in enumerate(aln):
    newid = "S%09d" % (i)
    subToReal[newid] = sanitizeString(record.id, False)
    record.id = newid

#############
# The phylip file gets a random name in the current directory, so write
# permission is needed there.
#
# For whatever reason, writing phylip works without problems in SeqIO but not
# in AlignIO, while for reading the FASTA it is the other way around.
#
# The random name is not exactly thread-safe.
##############
fname = "%d.phi" % (random.randint(0, 2**30))

fid = open(fname, "w")
SeqIO.write(aln, fid, "phylip")
def findGenesByOrganismList(orglist, runid, cl = None, sanitized = False, any_org = False, all_org = False, only_org = False, none_org = False, uniq_org = False, pct_cutoff = None):
    '''Identify clusters that have a specific set of properties with respect to a given set of
    organisms (orglist). The valid properties are ANY, ALL, ONLY, and NONE.

    Specify sanitized=True if the organism names passed here are sanitized (spaces, periods, etc.
    replaced by underscores - see sanitizeString.py for the standard way to sanitize names). The
    names are then translated back with the organisms file; orglist itself is not modified.

    If the list of (runid, clusterid, organism) tuples has already been computed, pass it in via
    the "cl" argument to avoid computing it again. Otherwise, it will be (re)computed within this
    function from runid. Rows belonging to the same (run ID, cluster ID) pair must be adjacent.

    You can also use the "cl" argument to restrict analysis to a specific set of (run ID, cluster ID)
    pairs by just passing that subset to the function. If no "cl" is passed then it is assumed you
    want to compare against ALL clusters in a run.

    The organisms in "orglist" are considered the "ingroup" and any organisms in the given cluster
    run but NOT in the orglist are considered the "outgroup". Clusters are pulled out according to
    the following table where the number in the entry corresponds to the number of represented
    ORGANISMS (NOT GENES) IN THE INGROUP (other combinations are possible - this is just a
    representative set of examples):

      Property  | Ingroup |  Outgroup
    +-----------+---------+-----------
      ALL       |  == N   |    >= 0
    +-----------+---------+-----------
      ANY       |  >= 1   |    >= 0
    +-----------+---------+-----------
      ONLY      |  >= 1   |    == 0
    +-----------+---------+-----------
      NONE      |  == 0   |    >= 1*
    +-----------+---------+-----------
     ALL + ONLY |  == N   |    == 0    - Genes that are only found in the ingroup and that are found in all members of the ingroup
    +-----------+---------+-----------
     ANY + ONLY |  >= 1   |    == 0    - Genes that are found only in the ingroup (but not necessarily in all of its members)
    +-----------+---------+-----------
     PCT_CUTOFF | >=PCT*N |    [Normally >=0 but you can also specify ONLY here]
    +-----------|---------+-----------
     ALL + NONE |
     ANY + NONE | Contradictions (raise errors).
     ONLY + NONE|
    +-----------+---------+-----------

    *: No clusters have 0 representatives

    N is the number of organisms in the ingroup

    UNIQ specifies that in addition to any other flags, genes in every organism in the ingroup
    must be uniquely represented in the cluster. Some groups definitions of "core genes" are
    satisfied by using ALL and UNIQ as constraints. Uniqueness is only enforced for the ingroup;
    outgroup organisms may appear more than once.

    The function returns a list of (runid, clusterid) pairs that adhere to the user-specified
    criteria, in the order in which the clusters appear in "cl".

    Raises ValueError for contradictory options, if no property is requested at all, or if
    pct_cutoff is outside of 0 - 100 or combined with ANY, ALL or NONE.'''

    if all_org and none_org:
        raise ValueError("ERROR: all_org and none_org options are contradictory\n")
    if any_org and none_org:
        raise ValueError("ERROR: any_org and none_org options are contradictory\n")
    if only_org and none_org:
        raise ValueError("ERROR: only_org and none_org options are contradictory\n")
    if not (only_org or all_org or any_org or none_org or pct_cutoff is not None):
        raise ValueError("ERROR: At least one of any_org, all_org, none_org, only_org or a pct_cutoff must be specified.\n")
    if pct_cutoff is not None and (float(pct_cutoff) > 100 or float(pct_cutoff) < 0):
        raise ValueError("ERROR: Percent cutoff must be between 0 and 100.\n")
    if pct_cutoff is not None and ( all_org or any_org or none_org ):
        raise ValueError("ERROR: Cannot specify both a percent cutoff and ANY, ALL or NONE\n")

    use_pct_cutoff = pct_cutoff is not None
    if use_pct_cutoff:
        pct_cutoff = float(pct_cutoff)

    # Change sanitized organism names to un-sanitized names using the organisms file.
    # A new list is built so the caller's list is left alone (and can be passed in again).
    if sanitized:
        allOrgsDict = {}
        with open(locateOrganismFile(), "r") as orgfile:
            for line in orgfile:
                spl = line.strip("\r\n").split("\t")
                allOrgsDict[sanitizeString(spl[0], False)] = spl[0]
        orglist = [ allOrgsDict[org] for org in orglist ]

    if cl is None:
        cl = getClusterOrgsByRun(runid)

    orgset = set(orglist)

    def meetsCriteria(currentorgs, uniqok):
        # Decide whether one completely collected cluster has every requested property.
        intersection = orgset & currentorgs
        anyok = len(intersection) > 0
        noneok = not anyok
        allok = len(intersection) == len(orgset)
        onlyok = len(currentorgs - orgset) == 0
        pctok = use_pct_cutoff and len(intersection) >= len(orgset) * pct_cutoff/100.0
        # We can't have any of the options be TRUE and not have the corresponding condition
        # also be true
        return not ( ( any_org and not anyok) or ( all_org and not allok) or (none_org and not noneok)
                     or (only_org and not onlyok) or (uniq_org and not uniqok) or ( use_pct_cutoff and not pctok) )

    goodClusters = []
    prevkey = None
    currentorgs = set()
    uniqok = True
    for l in cl:
        # Slurp up all rows of one cluster; when the (run, cluster) pair changes the
        # previous cluster is complete and can be checked.
        key = (l[0], l[1])
        if key != prevkey:
            if prevkey is not None and meetsCriteria(currentorgs, uniqok):
                goodClusters.append(prevkey)
            # Reset
            prevkey = key
            currentorgs = set()
            uniqok = True
        # Bugfix 07-05-13 - unique only has to apply to the ingroup (orgset)
        if l[2] in currentorgs and l[2] in orgset:
            uniqok = False
        currentorgs.add(l[2])

    # No row follows the final cluster, so it has to be checked here or it would be dropped.
    if prevkey is not None and meetsCriteria(currentorgs, uniqok):
        goodClusters.append(prevkey)

    return goodClusters
parser.add_option("-s", "--sanitized", help="Specify this if the organism IDs are sanitized in the file (fig_xx_yy instead of fig|xx.yy)",
                  action="store_true", dest="sanitized", default=False)
(options, args) = parser.parse_args()

if options.orgfile is None:
    sys.stderr.write("ERROR: Orgfile (-f orgfile) is a required argument to replaceOrgWithAbbrev\n")
    sys.exit(2)

keeppeg = options.keeppeg

# Map every organism ID (second column of the organism file) to the sanitized
# form of the first column, which is used as the replacement text.
orgAbbrev = {}
with open(options.orgfile, "r") as fid:
    for line in fid:
        spl = line.strip('\r\n').split("\t")
        if options.sanitized:
            orgid = sanitizeString(spl[1], False)
        else:
            orgid = spl[1]

        orgAbbrev[orgid] = sanitizeString(spl[0], False)

# NOTE(review): the replacements below are plain substring matches applied in
# dictionary order, so an ID that is a prefix of another ID could also be
# replaced inside the longer one - confirm whether IDs can overlap like that.
for line in fileinput.input("-"):
    myline = line.strip('\r\n')
    # I always replace this since it shouldn't break anything to leave it out anyway
    myline = myline.replace("fig|", "")

    for s in orgAbbrev:
        if keeppeg:
            # Keep the original ID after the abbreviation
            myline = myline.replace(s, orgAbbrev[s] + "_" + s)
        else:
            myline = myline.replace(s, orgAbbrev[s])
def findGenesByOrganismList(orglist,
                            runid,
                            cl=None,
                            sanitized=False,
                            any_org=False,
                            all_org=False,
                            only_org=False,
                            none_org=False,
                            uniq_org=False):
    '''Identify clusters that have a specific set of properties with respect to a given set of
    organisms. The valid properties are ANY, ALL, ONLY, and NONE.

    Specify sanitized=True if the organism names passed here are sanitized (spaces, periods, etc.
    replaced by underscores - see sanitizeString.py for the standard way to sanitize names). The
    names are then translated back with the organisms file; orglist itself is not modified.

    If the list of (runid, clusterid, organism) tuples has already been computed, pass it in via
    the "cl" argument to avoid computing it again. Otherwise, it will be (re)computed within this
    function from runid. Rows belonging to the same (run ID, cluster ID) pair must be adjacent.

    You can also use the "cl" argument to restrict analysis to a specific set of (run ID, cluster ID)
    pairs by just passing that subset to the function. If no "cl" is passed then it is assumed you
    want to compare against ALL clusters in a run.

    The organisms in "orglist" are considered the "ingroup" and any organisms in the given cluster
    run but NOT in the orglist are considered the "outgroup". Clusters are pulled out according to
    the following table where the number in the entry corresponds to the number of represented
    ORGANISMS (NOT GENES) IN THE INGROUP (other combinations are possible - this is just a
    representative set of examples):

      Property  | Ingroup |  Outgroup
    +-----------+---------+-----------
      ALL       |  == N   |    >= 0
    +-----------+---------+-----------
      ANY       |  >= 1   |    >= 0
    +-----------+---------+-----------
      ONLY      |  >= 1   |    == 0
    +-----------+---------+-----------
      NONE      |  == 0   |    >= 1*
    +-----------+---------+-----------
     ALL + ONLY |  == N   |    == 0    - Genes that are only found in the ingroup and that are found in all members of the ingroup
    +-----------+---------+-----------
     ANY + ONLY |  >= 1   |    == 0    - Genes that are found only in the ingroup (but not necessarily in all of its members)
    +-----------+---------+-----------
     ALL + NONE |
     ANY + NONE | Contradictions (raise errors).
     ONLY + NONE|
    +-----------+---------+-----------

    *: No clusters have 0 representatives

    N is the number of organisms in the ingroup and O is the number in the outgroup.

    UNIQ specifies that in addition to any other flags, genes in every organism in the ingroup
    must be uniquely represented in the cluster. Some groups definitions of "core genes" are
    satisfied by using ALL and UNIQ as constraints. Uniqueness is only enforced for the ingroup;
    outgroup organisms may appear more than once.

    The function returns a list of (runid, clusterid) pairs that adhere to the user-specified
    criteria, in the order in which the clusters appear in "cl".

    Raises ValueError for contradictory options or if no property is requested at all.'''

    if all_org and none_org:
        raise ValueError(
            "ERROR: all_org and none_org options are contradictory\n")
    if any_org and none_org:
        raise ValueError(
            "ERROR: any_org and none_org options are contradictory\n")
    if only_org and none_org:
        raise ValueError(
            "ERROR: only_org and none_org options are contradictory\n")
    if not (only_org or all_org or any_org or none_org):
        raise ValueError(
            "ERROR: At least one of any_org, all_org, none_org, or only_org must be specified.\n"
        )

    # Change sanitized organism names to un-sanitized names using the organisms file.
    # A new list is built so the caller's list is left alone (and can be passed in again).
    if sanitized:
        allOrgsDict = {}
        with open(locateOrganismFile(), "r") as orgfile:
            for line in orgfile:
                spl = line.strip("\r\n").split("\t")
                allOrgsDict[sanitizeString(spl[0], False)] = spl[0]
        orglist = [allOrgsDict[org] for org in orglist]

    if cl is None:
        cl = getClusterOrgsByRun(runid)

    orgset = set(orglist)

    def meetsCriteria(currentorgs, uniqok):
        # Decide whether one completely collected cluster has every requested property.
        intersection = orgset & currentorgs
        anyok = len(intersection) > 0
        noneok = not anyok
        allok = len(intersection) == len(orgset)
        onlyok = len(currentorgs - orgset) == 0
        # We can't have any of the options be TRUE and not have the corresponding
        # condition also be true
        return not ((any_org and not anyok) or (all_org and not allok) or
                    (none_org and not noneok) or
                    (only_org and not onlyok) or
                    (uniq_org and not uniqok))

    goodClusters = []
    prevkey = None
    currentorgs = set()
    uniqok = True
    for l in cl:
        # Slurp up all rows of one cluster; when the (run, cluster) pair changes the
        # previous cluster is complete and can be checked.
        key = (l[0], l[1])
        if key != prevkey:
            if prevkey is not None and meetsCriteria(currentorgs, uniqok):
                goodClusters.append(prevkey)
            # Reset
            prevkey = key
            currentorgs = set()
            uniqok = True

        # As documented above, uniqueness only has to hold for the ingroup (orgset).
        if l[2] in currentorgs and l[2] in orgset:
            uniqok = False
        currentorgs.add(l[2])

    # No row follows the final cluster, so it has to be checked here or it would be dropped.
    if prevkey is not None and meetsCriteria(currentorgs, uniqok):
        goodClusters.append(prevkey)

    return goodClusters
示例#16
0
# Use the database to get lists of organisms in each cluster that contains a gene in the query GPR...
con = sqlite3.connect(locateDatabase())
cur = con.cursor()

# We want to make sure we get all the organisms for a particular cluster... but we don't need
# all the genes for those.
# query1: the cluster(s) a given gene belongs to in a run.
# query2: the organisms found in a given cluster of a run.
# query3: every organism that appears anywhere in a run.
query1 = "SELECT clusterid,geneid FROM clusterorgs WHERE clusterorgs.runid=? AND clusterorgs.geneid=?"
query2 = "SELECT organism FROM clusterorgs WHERE clusterorgs.runid=? AND clusterorgs.clusterid=?"
query3 = "SELECT DISTINCT organism FROM clusterorgs WHERE clusterorgs.runid = ?"
cluster2orgs = {}
cluster2genes = {}

# Sanitized names of all organisms in the run
orglist = set()
cur.execute(query3, (options.runid, ))
for res in cur:
    orglist.add(sanitizeString(res[0], False))

for gene in genelist:
    cur.execute(query1, (options.runid, gene))
    # For genes we only care about the ones actually appearing in our GPRs.
    # clusterid stays None if the gene is not in any cluster of this run; if it is
    # in several, the last one read is the one used below.
    clusterid = None
    for res in cur:
        clusterid = str(res[0])
        geneid = str(res[1])
        if clusterid in cluster2genes:
            cluster2genes[clusterid].add(geneid)
        else:
            cluster2genes[clusterid] = set()
            cluster2genes[clusterid].add(geneid)
    # Now lets get what organisms are in that cluster.
    # NOTE(review): the rows returned by this query are never read in the code shown here,
    # and cluster2orgs is never filled. The statements from this point on look like they were
    # spliced in from a different script - confirm against the original source.
    cur.execute(query2, (options.runid, clusterid))
    # Rename every leaf of the tree to "<organism>_<annotation>_<gene ID>" (sanitized).
    for node in t.traverse():
        if node.is_leaf():
            unsanitized = unsanitizeGeneId(node.name)
            geneinfo = getGeneInfo( [ unsanitized ], cur)
            if len(geneinfo) > 0:
                organism = geneinfo[0][1]
                annotation = geneinfo[0][9]
            else:
                # FIXME - Attempt to get the organism name from the contig for TBLASTN IDs. If that fails we just give up.
                annotation = ""
                try:
                    contig,start,stop = splitTblastn(unsanitized)
                    q = "SELECT organism FROM organisms INNER JOIN contigs ON contigs.organismid = organisms.organismid WHERE contigs.contig_mod=?;"
                    cur.execute(q, (contig,) )
                    # NOTE(review): if this query returns no rows, organism keeps the value from
                    # an earlier leaf (or is undefined for the first one) - confirm.
                    for res in cur:
                        organism = res[0]
                except ValueError:
                    organism = ""
            # Only the first 63 characters of the annotation go into the leaf name
            node.name = sanitizeString("%s_%s_%s" %(organism, annotation[0:63], unsanitized), False)
    
    t, ts = prettifyTree(t, title = gene + " cluster regions", show_bootstraps = False, ts=ts)

    # NOTE(review): this removes test.svg, while the tree is rendered to <outfile>.svg below.
    os.system("rm test.svg 2> /dev/null")
    t.render("%s.svg" %(options.outfile), tree_style=ts)
    # Convert the rendered SVG into a trimmed PNG with a transparent background
    os.system("convert -trim -depth 32 -background transparent %s.svg %s.png" %(options.outfile, options.outfile))

    if options.display:
        t.show(tree_style=ts)

    # NOTE(review): at this indentation the connection is closed at the end of the first pass
    # through the gene loop, so the query at the top of the next pass would presumably fail -
    # confirm the intended indentation.
    con.close()
parser.add_option("-k", "--keeppeg", help="Keep PEG ID? (if specified, keeps peg id. If not, throws it away)", action="store_true", dest="keeppeg", default=False)
parser.add_option("-s", "--sanitized", help="Specify this if the organism IDs are sanitized in the file (fig_xx_yy instead of fig|xx.yy)",
                  action="store_true", dest="sanitized", default=False)
(options, args) = parser.parse_args()

# Fall back to the default organism file if none was given
if options.orgfile is None:
    options.orgfile = locateOrganismFile()

keeppeg = options.keeppeg

# Map every organism ID (second column of the organism file) to the sanitized
# form of the first column, which is used as the replacement text.
orgAbbrev = {}
with open(options.orgfile, "r") as fid:
    for line in fid:
        spl = line.strip('\r\n').split("\t")
        if options.sanitized:
            orgid = sanitizeString(spl[1], False)
        else:
            orgid = spl[1]

        orgAbbrev[orgid] = sanitizeString(spl[0], False)

# NOTE(review): the replacements below are plain substring matches applied in
# dictionary order, so an ID that is a prefix of another ID could also be
# replaced inside the longer one - confirm whether IDs can overlap like that.
for line in fileinput.input("-"):
    myline = line.strip('\r\n')
    # I always replace this since it shouldn't break anything to leave it out anyway
    myline = myline.replace("fig|", "")

    for s in orgAbbrev:
        if keeppeg:
            # Keep the original ID after the abbreviation
            myline = myline.replace(s, orgAbbrev[s] + "_" + s)
        else:
            myline = myline.replace(s, orgAbbrev[s])
if options.gene:
    wanted.append("geneid")
if options.ann:
    wanted.append("annotation")
selstr = ",".join(wanted)
query = "SELECT %s FROM processed WHERE processed.geneid=?;" %(selstr)

# Matches gene IDs of the form fig|<number>.<number>.peg.<number>.
# Raw string so that the regex escapes are not treated as string escapes.
geneFinder = re.compile(r"fig\|\d+\.\d+\.peg\.\d+")


def _annotationForGeneId(match):
    '''
    Substitution callback for geneFinder: look the matched gene ID up with the
    prepared query and return the sanitized, "_"-joined result columns.

    If the gene ID is not in the database a warning is written to stderr and the
    gene ID is returned unchanged (i.e. it is left as-is in the output line).
    '''
    rep = match.group(0)
    cur.execute(query, (rep, ))

    # Get string with which to replace (if several rows come back the last one wins)
    annotestr = ""
    for c in cur:
        annotestr = "_".join([str(s) for s in c])

    if annotestr == "":
        sys.stderr.write("WARNING: Gene id %s not found in the database - skipping...\n" %(rep))
        return rep

    # Sanitize the annotation before putting it into the line
    return sanitizeString(annotestr, False)


for line in fileinput.input("-"):
    st = line.strip("\r\n")
    # Substitute match-by-match rather than with str.replace(): a plain replace of
    # e.g. fig|1.1.peg.1 would also clobber the front of fig|1.1.peg.10 on the same line.
    print(geneFinder.sub(_annotationForGeneId, st))

con.close()
示例#20
0
                  default=1)
(options, args) = parser.parse_args()

gc = options.gc - 1

if len(args) < 1:
    sys.stderr.write("ERROR: Run ID is required argument.\n")
    exit(2)

con = sqlite3.connect(locateDatabase())
cur = con.cursor()

runid = args[0]

if not os.path.exists(options.directory):
    os.makedirs(options.directory)

for line in fileinput.input("-"):
    spl = line.strip("\r\n").split("\t")
    geneid = spl[gc]
    diagram = makeSingleGeneNeighborhoodDiagram(
        geneid,
        runid,
        cur,
        labeltype=options.labeltype,
        imgfileloc=os.path.join(options.directory,
                                sanitizeString(geneid, False)))
    sys.stderr.write("Saved result to %s\n" % (diagram))

cur.close()
    exit(2)

if GLOBALBOOTS and NUMBOOTS == 0:
    sys.stderr.write("ERROR: Specifying global bootstrap without any bootstraps doesnt make sense! Did you forget to specify -b?\n")
    exit(2)

# Read the FASTA file from stdin and convert it into a phylip file
# Use list so we actually edit in-place rather than
# just editing a copy that gets destroyed later!
aln = list(AlignIO.read(sys.stdin, "fasta"))

# We will use this to convert back to the IDs in the fasta file
subToReal = {}
for i in range(len(aln)):
    newid = "S%09d" %(i)
    subToReal[newid] = sanitizeString(aln[i].id, False)
    aln[i].id = newid

#############
# Make a temporary random file to place the phylip (must have write permission in the current directory)
#
# For whatever reason, there are no problems with the phylip writing in SeqIO
# but it doesn't work in AlignIO, while for reading the FASTA it's the opposite case.
# Whatever.
#
# This isn't exactly thread-safe
##############
fname = "%d.phi" %(random.randint(0,2**30))

fid = open(fname, "w")
SeqIO.write(aln, fid, "phylip")
if options.ann:
    wanted.append("annotation")
selstr = ",".join(wanted)
query = "SELECT %s FROM processed WHERE processed.geneid=?;" % (selstr)

# Matches gene IDs of the form fig|<number>.<number>.peg.<number>.
# Raw string so that the regex escapes are not treated as string escapes.
geneFinder = re.compile(r"fig\|\d+\.\d+\.peg\.\d+")


def _annotationForGeneId(match):
    '''
    Substitution callback for geneFinder: look the matched gene ID up with the
    prepared query and return the sanitized, "_"-joined result columns.

    If the gene ID is not in the database a warning is written to stderr and the
    gene ID is returned unchanged (i.e. it is left as-is in the output line).
    '''
    rep = match.group(0)
    cur.execute(query, (rep, ))

    # Get string with which to replace (if several rows come back the last one wins)
    annotestr = ""
    for c in cur:
        annotestr = "_".join([str(s) for s in c])

    if annotestr == "":
        sys.stderr.write(
            "WARNING: Gene id %s not found in the database - skipping...\n"
            % (rep))
        return rep

    # Sanitize the annotation before putting it into the line
    return sanitizeString(annotestr, False)


for line in fileinput.input("-"):
    st = line.strip("\r\n")
    # Substitute match-by-match rather than with str.replace(): a plain replace of
    # e.g. fig|1.1.peg.1 would also clobber the front of fig|1.1.peg.10 on the same line.
    print(geneFinder.sub(_annotationForGeneId, st))

con.close()
示例#23
0
def findGenesByOrganismList(orglist,
                            runid,
                            cl=None,
                            sanitized=False,
                            any_org=False,
                            all_org=False,
                            only_org=False,
                            none_org=False,
                            uniq_org=False,
                            pct_cutoff=None,
                            outgroup=None):
    '''Identify clusters that have a specific set of properties with respect to a given set of
    organisms (orglist). The valid properties are ANY, ALL, ONLY, and NONE.

    Specify sanitized=True if the organism names passed here are sanitized (spaces, periods, etc. replaced by
    underscores - see sanitizeString.py for the standard way to sanitize names). They are translated back
    using the first column of the organisms file. The lists passed in by the caller are NOT modified.

    If the list of (runid, clusterid, organism) tuples has already been computed, pass it in via the "cl"
    argument to avoid computing it again. Otherwise, it will be (re)computed within this function from runid.
    All rows belonging to one cluster must be adjacent in "cl": a new cluster is started every time the
    cluster ID (second element) changes from one row to the next.

    You can also use the "cl" argument to restrict analysis to a specific set of (run ID, cluster ID) pairs
    by just passing that subset to the function. If no "cl" is passed then it is assumed you want to compare against
    ALL clusters in a run.

    INGROUP: orglist
    OUTGROUP: How this is computed depends on how you call this function.
         By default: The outgroup is computed for each cluster as the group of organisms that are in that cluster but not in the ingroup.
         IF outgroup is passed to this function: The below are evaluated IGNORING any organisms that are not in the ingroup or the outgroup.

    Outgroup if specified is a list of organism names to use as part of the outgroup.

    Clusters are pulled out according to the following table
    where the number in the entry corresponds to the number of represented ORGANISMS (NOT GENES) IN THE INGROUP
    (other combinations are possible - this is just a representative set of examples):

      Property  | Ingroup |  Outgroup
    +-----------+---------+-----------
      ALL       |  == N   |    >= 0
    +-----------+---------+-----------
      ANY       |  >= 1   |    >= 0
    +-----------+---------+-----------
      ONLY      |  >= 1   |    == 0
    +-----------+---------+-----------
      NONE      |  == 0   |    >= 1*
    +-----------+---------+-----------
     ALL + ONLY |  == N   |    == 0    - Genes that are only found in the ingroup and that are found in all members of the ingroup
    +-----------+---------+-----------
     ANY + ONLY |  >= 1   |    == 0    - Genes that are found only in the ingroup (but not necessarily in all of its members)
    +-----------+---------+-----------
     PCT_CUTOFF | >=PCT*N |    [Normally >=0 but you can also specify ONLY here]
    +-----------|---------+-----------
     ALL + NONE |
     ANY + NONE | Contradictions (raise errors).
     ONLY + NONE|
    +-----------+---------+-----------

    *: No clusters have 0 representatives

    N is the number of organisms in the ingroup

    UNIQ specifies that in addition to any other flags, genes in every organism in the ingroup
    must be uniquely represented in the cluster. Some groups definitions of "core genes" are
    satisfied by using ALL and UNIQ as constraints.

    Raises ValueError for contradictory or missing criteria, or for a percent cutoff outside [0, 100].
    Raises KeyError if sanitized=True and a name is not found in the organisms file.

    The function returns a list of (runid, clusterid) pairs that adhere to the user-specified criteria,
    in the order in which the clusters appear in "cl".
    '''

    if all_org and none_org:
        raise ValueError(
            "ERROR: all_org and none_org options are contradictory\n")
    if any_org and none_org:
        raise ValueError(
            "ERROR: any_org and none_org options are contradictory\n")
    if only_org and none_org:
        raise ValueError(
            "ERROR: only_org and none_org options are contradictory\n")
    if not (only_org or all_org or any_org or none_org
            or pct_cutoff is not None):
        raise ValueError(
            "ERROR: At least one of any_org, all_org, none_org, only_org or a pct_cutoff must be specified.\n"
        )
    if pct_cutoff is not None and (float(pct_cutoff) > 100
                                   or float(pct_cutoff) < 0):
        raise ValueError("ERROR: Percent cutoff must be between 0 and 100.\n")
    if pct_cutoff is not None and (all_org or any_org or none_org):
        raise ValueError(
            "ERROR: Cannot specify both a percent cutoff and ANY, ALL or NONE\n"
        )

    use_pct_cutoff = pct_cutoff is not None
    if use_pct_cutoff:
        pct_cutoff = float(pct_cutoff)

    # Change sanitized organism names to un-sanitized names using the organisms file.
    # New lists are built so that the caller's lists are left untouched (rewriting them
    # in place made a second call with the same list fail with a KeyError).
    if sanitized:
        allOrgsDict = {}
        with open(locateOrganismFile(), "r") as orgfile:
            for line in orgfile:
                spl = line.strip("\r\n").split("\t")
                allOrgsDict[sanitizeString(spl[0], False)] = spl[0]

        orglist = [allOrgsDict[org] for org in orglist]
        if outgroup is not None:
            outgroup = [allOrgsDict[org] for org in outgroup]

    # If no list of cluster\run\organism triplets is specified as an input,
    # we want to test the criteria with all of them.
    if cl is None:
        cl = getClusterOrgsByRun(runid)

    if outgroup is not None:
        outgroup = set(outgroup)
    orgset = set(orglist)

    def clusterIsGood(clusterorgs, uniqok):
        # Evaluate every criterion for one fully-collected cluster.
        intersection = orgset & clusterorgs
        anyok = len(intersection) > 0
        noneok = not anyok
        allok = len(intersection) == len(orgset)
        onlyok = len(clusterorgs - orgset) == 0
        pctok = use_pct_cutoff and len(
            intersection) >= len(orgset) * pct_cutoff / 100.0
        # Our criteria: we can't have any of the options be TRUE and not have the corresponding condition also be true
        return not ((any_org and not anyok) or (all_org and not allok) or
                    (none_org and not noneok) or
                    (only_org and not onlyok) or
                    (uniq_org and not uniqok) or
                    (use_pct_cutoff and not pctok))

    goodClusters = []
    currentorgs = set()
    uniqok = True
    started = False
    previd = None
    prevrun = None
    for l in cl:
        # We slurp up all organisms in a specific cluster (storing in currentorgs)
        # and then once we have all of them we check for the specified conditions.
        if not started or l[1] != previd:
            if started and clusterIsGood(currentorgs, uniqok):
                goodClusters.append((prevrun, previd))

            # Reset
            started = True
            uniqok = True
            currentorgs = set()
            previd = l[1]
            prevrun = l[0]

        # Bugfix 07-05-13 - unique only has to apply to the ingroup (orgset)
        if l[2] in currentorgs and l[2] in orgset:
            uniqok = False

        # If an outgroup is specified and the organism isn't in either the ingroup or the outgroup,
        # don't include it in the analysis.
        if outgroup is not None:
            if l[2] not in outgroup and l[2] not in orgset:
                continue

        currentorgs.add(l[2])

    # The final cluster has no following row to trigger its evaluation, so check it here.
    if started and clusterIsGood(currentorgs, uniqok):
        goodClusters.append((prevrun, previd))

    return goodClusters
示例#24
0
The color code is not necessarily consistent across different input genes but is internally consistent for neighborhoods of a given gene."""

parser = optparse.OptionParser(usage=usage, description=description)
parser.add_option("-d", "--directory", help="Directory in which to save neighborhood diagrams. Default is 'geneNeighborhoods'", action="store", 
                  dest="directory", type="str", default='geneNeighborhoods')
parser.add_option("-l", "--labeltype", help="Type of label to use. Valid types are 'aliases' or 'clusterid' (D: aliases)", action="store", dest="labeltype", type="str", default="aliases")
parser.add_option("-g", "--genecol", help="Column number for gene IDs starting from 1 (D: 1)", action="store", dest="gc", type="int", default=1)
(options, args) = parser.parse_args()

gc = options.gc - 1

if len(args) < 1:
    sys.stderr.write("ERROR: Run ID is required argument.\n")
    exit(2)

con = sqlite3.connect(locateDatabase())
cur = con.cursor()

runid = args[0]

if not os.path.exists(options.directory):
    os.makedirs(options.directory)

for line in fileinput.input("-"):
    spl = line.strip("\r\n").split("\t")
    geneid = spl[gc]
    diagram = makeSingleGeneNeighborhoodDiagram(geneid, runid, cur, labeltype = options.labeltype, imgfileloc = os.path.join(options.directory, sanitizeString(geneid, False)))
    sys.stderr.write("Saved result to %s\n" %(diagram))

cur.close()
示例#25
0
    )
    CLEANUP = True

# Read the FASTA file from stdin and convert it into a phylip file
# Use list so we actually edit in-place rather than
# just editing a copy that gets destroyed later!
aln = list(AlignIO.read(sys.stdin, "fasta"))

# We will use this to convert back to the IDs in the fasta file
subToReal = {}
badchars = " (),:;"
for i in range(len(aln)):
    newid = "S%09d" % (i)
    # FastTree automatically sanitizes ID strings for special characters in Newick files ( (),:; and spaces ) but
    # RaxML does not. I need to provide warning of this and replace if necessary
    aln[i].id = sanitizeString(aln[i].id, True)
    subToReal[newid] = aln[i].id
    aln[i].id = newid

# Make a temporary random file to place the phylip (must have write permission in the current directory).
# I would've used /tmp/ but RAXML assumes you want the files in the current directory and
# gets very angry when you end up specifying [currentdirectory]/./tmp/...
#
# For whatever reason, there are no problems with the phylip writing in SeqIO
# but it doesn't work in AlignIO, while for reading the FASTA it's the opposite case.
# Whatever.
#
# This isn't exactly thread-safe
fname = "%d.phi" % (random.randint(0, 2**30))

fid = open(fname, "w")
示例#26
0
def kbaseGenomeToGenbank(genome_object, taxid=None):
    '''Convert a KBase genome object into Genbank format, incorporating as much info as we can
    as found in the NCBI genbank files, and write the result to stdout.

    Note - the genome object (not to be confused with a ModelSEED "annotation" object) has both annotations / translations
    AND the DNA sequence. It's obtained by calling annotate_genome on an object that only has the DNA sequence.

    genome_object: dictionary with (at least) the keys "scientific_name", "id", "contigs" (each entry
                   having "id" and "dna") and "features" (each entry having "id", "type", a single-entry
                   "location" list of [contig, start, strand, length], optionally "function", and
                   "protein_translation" for CDS / peg features).
    taxid: Taxonomy ID to place in the "source" feature of each contig. If None, the KBase CDMI
           service is queried for it (needs CDMI.py); the program exits with status 2 if that
           module cannot be imported or no TaxID is found.

    One record is written per contig. Returns None.'''

    organism_name = genome_object["scientific_name"]
    organism_id = genome_object["id"]

    # Get the TaxID
    # If none is specified the user has to provide one (or at least some unique integer, not necessarily a tax ID) for this system to work right.
    if taxid is None:
        # CDMI.py is from the KBase - we need it to get the Taxon ID
        # Download it at http://kbase.science.energy.gov/developer-zone/downloads/
        try:
            from CDMI import CDMI_EntityAPI
        except ImportError:
            sys.stderr.write(
                "ERROR: If no TaxID is provided, the CDMI.py file is necessary (http://kbase.science.energy.gov/developer-zone/downloads/) to attempt to guess it.\n"
            )
            sys.exit(2)
        URL = "https://www.kbase.us/services/cdmi_api/"
        cdmi_entity = CDMI_EntityAPI(URL)
        reldict = cdmi_entity.get_relationship_IsInTaxa(
            organism_id, [], [], ["id"])
        taxidlist = []
        if reldict is not None:
            taxidlist = getFieldFromRelationship(reldict, "id", "to")
        # An empty result is treated the same as no result at all (previously an IndexError).
        if len(taxidlist) == 0:
            sys.stderr.write(
                "ERROR: TaxID for Organism ID %s not found in the KBase CDMI. You will need to specify it manually if you want it\n"
                % (organism_id))
            sys.exit(2)
        taxid = taxidlist[0]

    annotations = {'source': organism_name, 'organism': organism_name}

    # Specify contig data and "source" features for each contig (required by the genbank standard)
    contig_to_sequence = {}
    contig_to_feature_data = {}
    for contig in genome_object["contigs"]:
        contig_to_sequence[contig["id"]] = contig["dna"]
        qualifiers = {}
        qualifiers["organism"] = organism_name
        qualifiers["mol_type"] = "Genomic DNA"
        if taxid is not None:
            qualifiers["db_xref"] = "taxon:%s" % (taxid)
        source_feature = SeqFeature(FeatureLocation(0, len(contig["dna"])),
                                    strand=1,
                                    type="source",
                                    qualifiers=qualifiers)
        contig_to_feature_data[contig["id"]] = [source_feature]

    # The contig references are inside the feature definitions in the Genome object file, but
    # in a genbank file the features in a contig must all be separated.
    # Therefore I have to keep track of them in one step and then create the SeqRecord objects
    # in a separate step.
    rRNA_finders = [
        "rRNA", "ribosomal", "5S", "16S", "23S", "5.8S", "28S", "18S"
    ]
    tRNA_finders = ["tRNA", "transfer"]
    for kbase_feature in genome_object["features"]:
        # FIXME - What do I do with things that have more than one location?
        assert (len(kbase_feature["location"]) == 1)

        # First lets deal with start and stop locations...
        # I verified against Pubseed that these semantics and calculations are correct, at least
        # for the proteins I checked that are the same between pubseed and KBase...
        loc = kbase_feature["location"][0]
        contig = loc[0]
        start = int(loc[1])
        if loc[2] == "-":
            strand = -1
        else:
            strand = 1
        # Cast like the start coordinate so that a string-valued length does not break the arithmetic.
        featurelen = int(loc[3])
        if strand == -1:
            stop = start - featurelen + 1
        else:
            stop = start + featurelen - 1
        # Now I need to convert these into Python slicing indexes...because that is what FeatureLocation wants.
        # This includes making the start always less than stop and offsetting the stop by 1 because slice [a,b] only goes up to position b-1
        seqstart = min(start, stop) - 1
        seqstop = max(start, stop)

        feature_id = kbase_feature["id"]
        feature_type = kbase_feature["type"]

        qualifiers = {}
        # Unfortunately there are features including proteins in the genome objects that have no function (not even "hypothetical protein")
        # Thankfully this isn't a required field in the Genbank file
        if "function" in kbase_feature:
            qualifiers["product"] = strip_control_characters(
                kbase_feature["function"])
        if feature_type == "CDS" or feature_type == "peg":
            qualifiers["protein_id"] = feature_id
            qualifiers["translation"] = kbase_feature["protein_translation"]

        # "RNA" is not an official type in a GENBANK file.
        # We attempt to figure out based on the annotation whether it is a tRNA, rRNA, or other (misc_RNA) RNA.
        # These are the official RNA types (aside from mRNA but those don't have special fields in the Genome object)
        if feature_type == "rna":
            # RNA features without any function annotation simply become misc_RNA.
            function = kbase_feature.get("function") or ""
            # A tRNA hit takes precedence over an rRNA hit.
            if any(finder in function for finder in tRNA_finders):
                feature_type = "tRNA"
            elif any(finder in function for finder in rRNA_finders):
                feature_type = "rRNA"
            else:
                feature_type = "misc_RNA"

        # I checked that the above formulas give the correct positions in the genbank file (or at least, the same as the PubSEED genbank files).
        seq_feature = SeqFeature(FeatureLocation(seqstart, seqstop),
                                 strand=strand,
                                 type=feature_type,
                                 id=feature_id,
                                 qualifiers=qualifiers)

        # Attach the new features to the appropriate contig...
        contig_to_feature_data.setdefault(contig, []).append(seq_feature)

    # Create one record for each contig
    # NOTE: a feature that names a contig missing from genome_object["contigs"] has no sequence
    # to attach to and makes the lookup below fail with a KeyError.
    records = []
    for contig in contig_to_feature_data:
        seq = Seq(contig_to_sequence[contig], IUPAC.ambiguous_dna)
        record = SeqRecord(seq,
                           id=sanitizeString(contig, False),
                           description="%s contig %s" %
                           (organism_name, contig),
                           name=contig,
                           features=contig_to_feature_data[contig],
                           annotations=annotations)
        records.append(record)
    SeqIO.write(records, sys.stdout, "genbank")
示例#27
0
    exit(2)

con = sqlite3.connect(locateDatabase())
cur = con.cursor()

# If a run ID is specified, we want to only return columns in that run.
orgsToInclude = None
if options.runid is not None:
    orgsToInclude = set()
    orgs = getOrganismsInClusterRun(options.runid, cur)
    if len(orgs) == 0:
        raise IOError(
            "ERROR: Specified run ID %s does not exist in the database." %
            (options.runid))
    for org in orgs:
        orgsToInclude.add(sanitizeString(org, False))

if options.runid is None and options.clusterid is None:
    cur.execute("SELECT * FROM presenceabsence;")
else:
    if options.clusterid is None:
        cur.execute("SELECT * FROM presenceabsence WHERE runid = ?",
                    (options.runid, ))
    else:
        runid = options.runid
        clustid = options.clusterid
        cur.execute(
            "SELECT * FROM presenceabsence WHERE runid = ? AND clusterid = ?",
            (runid, clustid))

nameorder = []
            else:
                # FIXME - Attempt to get the organism name from the contig for TBLASTN IDs. If that fails we just give up.
                annotation = ""
                organism = ""
                try:
                    contig, start, stop = splitTblastn(unsanitized)
                    if contig in sanitizedToNot:
                        contig = sanitizedToNot[contig]
                    q = "SELECT organism FROM organisms INNER JOIN contigs ON contigs.organismid = organisms.organismid WHERE contigs.contig_mod=?;"
                    cur.execute(q, (contig, ))
                    for res in cur:
                        organism = res[0]
                except ValueError:
                    # Not a tblastn ID.
                    pass
            node.name = sanitizeString(
                "%s_%s_%s" % (organism, annotation[0:63], unsanitized), False)

    t, ts = prettifyTree(t,
                         title=gene + " cluster regions",
                         show_bootstraps=False,
                         ts=ts)

    if options.savepng:
        os.system("rm test.svg 2> /dev/null")
        t.render("%s.svg" % (options.outfile), tree_style=ts)
        os.system(
            "convert -trim -depth 32 -background transparent %s.svg %s.png" %
            (options.outfile, options.outfile))

    if options.display:
        t.show(tree_style=ts)
示例#29
0
    sys.stderr.write("WARNING: specification of -c overwrites -k (CLEANUP) due to technical limitations in RAXML\n")
    CLEANUP = True

# Read the FASTA file from stdin and convert it into a phylip file
# Use list so we actually edit in-place rather than
# just editing a copy that gets destroyed later!
aln = list(AlignIO.read(sys.stdin, "fasta"))

# We will use this to convert back to the IDs in the fasta file
subToReal = {}
badchars = " (),:;"
for i in range(len(aln)):
    newid = "S%09d" %(i)
    # FastTree automatically sanitizes ID strings for special characters in Newick files ( (),:; and spaces ) but
    # RaxML does not. I need to provide warning of this and replace if necessary
    aln[i].id = sanitizeString(aln[i].id, True)
    subToReal[newid] = aln[i].id
    aln[i].id = newid

# Make a temporary random file to place the phylip (must have write permission in the current directory).
# I would've used /tmp/ but RAXML assumes you want the files in the current directory and
# gets very angry when you end up specifying [currentdirectory]/./tmp/...
#
# For whatever reason, there are no problems with the phylip writing in SeqIO
# but it doesn't work in AlignIO, while for reading the FASTA it's the opposite case.
# Whatever.
#
# This isn't exactly thread-safe
fname = "%d.phi" %(random.randint(0,2**30))

fid = open(fname, "w")
示例#30
0
sys.stderr.write("Reading gene annotations and organisms from database...\n")

# FIXME - This should call the library functions to get geneinfo for specific sets of genes
# instead of reading the whole "processed" table.
con = sqlite3.connect(locateDatabase())
cur = con.cursor()
cur.execute("SELECT * FROM processed;")

# Both lookup tables are keyed on the sanitized gene ID (column 0) so that the code works
# whether or not the leaf names have been sanitized in the input tree. The values are
# sanitized as well because the SVG parser whines with some special characters (i.e. ' ).
geneToAnnote = {}
geneToOrganism = {}
for row in cur:
    fields = [str(value) for value in row]
    genekey = sanitizeString(fields[0], False)
    geneToAnnote[genekey] = sanitizeString(fields[9], False)
    geneToOrganism[genekey] = sanitizeString(fields[1], False)

######################
# Add annotations and
# larger bootstrap values to tree
######################

for node in t.traverse():
    if not node.is_leaf():
        continue
    leafkey = sanitizeString(node.name, False)
    # Don't crash because of e.g. outgroups put in: leaves that are not in the lookup
    # tables keep their name (per the original note, a warning was already issued for these).
    if leafkey not in geneToOrganism or leafkey not in geneToAnnote:
        continue
    node.name = "_".join([node.name, geneToOrganism[leafkey], geneToAnnote[leafkey]])
The color code is not necessarily consistent across different input genes but is internally consistent for neighborhoods of a given gene."""

parser = optparse.OptionParser(usage=usage, description=description)
parser.add_option("-d", "--directory", help="Directory in which to save neighborhood diagrams. Default is 'geneNeighborhoods'", action="store", 
                  dest="directory", type="str", default='geneNeighborhoods')
parser.add_option("-l", "--labeltype", help="Type of label to use. Valid types are 'aliases' or 'clusterid' (D: aliases)", action="store", dest="labeltype", type="str", default="aliases")
parser.add_option("-g", "--genecol", help="Column number for gene IDs starting from 1 (D: 1)", action="store", dest="gc", type="int", default=1)
(options, args) = parser.parse_args()

gc = options.gc - 1

if len(args) < 1:
    sys.stderr.write("ERROR: Run ID is required argument.\n")
    exit(2)

con = sqlite3.connect(locateDatabase())
cur = con.cursor()

runid = args[0]

if not os.path.exists(options.directory):
    os.makedirs(options.directory)

for line in fileinput.input("-"):
    spl = line.strip("\r\n").split("\t")
    geneid = spl[gc]
    diagram = makeSingleGeneNeighborhoodDiagram(geneid, runid, cur, labeltype = options.labeltype, imgfileloc = os.path.join(options.directory, sanitizeString(geneid, False)))
    sys.stderr.write("Saved result to %s\n" %(diagram))

cur.close()
if options.iteponly and options.useronly:
    sys.stderr.write("ERROR: Cannot ask for both only itep and only user-specified genes\n")
    exit(2)

con = sqlite3.connect(locateDatabase())
cur = con.cursor()

# If a run ID is specified, we want to only return columns in that run.
orgsToInclude = None
if options.runid is not None:
    orgsToInclude = set()
    orgs = getOrganismsInClusterRun(options.runid, cur)
    if len(orgs) == 0:
        raise IOError("ERROR: Specified run ID %s does not exist in the database." %(options.runid) )        
    for org in orgs:
        orgsToInclude.add(sanitizeString(org, False))

if options.runid is None and options.clusterid is None:
    cur.execute("SELECT * FROM presenceabsence;")
else:
    if options.clusterid is None:
        cur.execute("SELECT * FROM presenceabsence WHERE runid = ?", (options.runid,))
    else: 
        runid = options.runid
        clustid = options.clusterid
        cur.execute("SELECT * FROM presenceabsence WHERE runid = ? AND clusterid = ?", (runid, clustid))

nameorder = []
if options.treeorder is not None:
    nameorder = treeorder(options.treeorder)
        else:
            myorgstr = "%s\t%s" % (myorgstr, "NONE")
    myorgstr = myorgstr.lstrip()
    myline = "%s\t%s\t%s\t%s" % (myrunid, myclusterid, myannote, myorgstr)
    mytable.append(myline)

# (Re)build the presenceabsence SQL table from the rows collected in mytable.
cur.execute("DROP TABLE IF EXISTS presenceabsence;")

# Three fixed columns, followed by one VARCHAR column per organism (named by its sanitized name).
orgcolumns = [""", %s VARCHAR(128)""" % (sanitizeString(org, False)) for org in orgList]
cmd = """CREATE TABLE presenceabsence (
"runid" VARCHAR(128),
"clusterid" INT,
"annote" VARCHAR(2048)""" + "".join(orgcolumns) + ");"

cur.execute(cmd)

for ln in mytable:
    sp = ln.split("\t")
    # One "?" placeholder for every tab-delimited field in the row.
    cmd = "INSERT INTO presenceabsence VALUES (" + ",".join(["?"] * len(sp)) + ");"
    cur.execute(cmd, tuple(sp))
# We have to commit because we added a table.
con.commit()
# Use the database to get lists of organisms in each cluster that contains a gene in the query GPR...
con = sqlite3.connect(locateDatabase())
cur = con.cursor()

# We want to make sure we get all the organisms for a particular cluster... but we don't need
# all the genes for those.
query1 = "SELECT clusterid,geneid FROM clusterorgs WHERE clusterorgs.runid=? AND clusterorgs.geneid=?"
query2 = "SELECT organism FROM clusterorgs WHERE clusterorgs.runid=? AND clusterorgs.clusterid=?"
query3 = "SELECT DISTINCT organism FROM clusterorgs WHERE clusterorgs.runid = ?"
cluster2orgs = {}
cluster2genes = {}

orglist = set()
cur.execute(query3, (options.runid, ))
for res in cur:
    orglist.add(sanitizeString(res[0], False))

for gene in genelist:
    cur.execute(query1, (options.runid, gene))
    # For genes we only care about the ones actually appearing in our GPRs.
    clusterid = None
    for res in cur:
        clusterid = str(res[0])
        geneid = str(res[1])
        if clusterid in cluster2genes:
            cluster2genes[clusterid].add(geneid)
        else:
            cluster2genes[clusterid] = set()
            cluster2genes[clusterid].add(geneid)
    # Now lets get what organisms are in that cluster.
    cur.execute(query2, (options.runid, clusterid))