Пример #1
0
                          "bridgemapper.py")

# Path of the optional "manual_geneid" file under the repository's etc
# directory; it is parsed into m_hashGeneIDs below when present.
c_strManualGeneIDs = sfle.d(arepa.path_repo(), sfle.c_strDirEtc,
                            "manual_geneid")

# Organism name -> path of the BridgeDB database file shipped in the
# GeneMapper etc directory.
c_pHashBridgeDB = {
    strName: sfle.d(arepa.path_arepa(), "GeneMapper", sfle.c_strDirEtc, strDB)
    for strName, strDB in (
        # The human entry was previously spelled with masked characters
        # ("H**o sapiens"), which is not a valid organism name.
        ("Homo sapiens", "Hs_Derby_20120602.bridge"),
        ("Mus musculus", "Mm_Derby_20120602.bridge"),
        ("Saccharomyces cerevisiae", "Sc_Derby_20120602.bridge"),
    )
}

# Organism name -> result of arepa.org2taxid for that name.
# NOTE(review): the meaning of the second argument (True) is defined in
# arepa.org2taxid and is not visible here - confirm.
pHashBridgeDBTaxIDs = {
    k: arepa.org2taxid(k, True)
    for k in c_pHashBridgeDB
}

# Key -> value pairs read from the manual gene ID file: every entry returned
# by sfle.readcomment must hold exactly two tab-separated fields.
# None when the file does not exist.
if os.path.exists(c_strManualGeneIDs):
    # The context manager closes the handle once parsing is finished.
    with open(c_strManualGeneIDs) as fileManualGeneIDs:
        m_hashGeneIDs = {
            k: v
            for k, v in
            [x.split("\t") for x in sfle.readcomment(fileManualGeneIDs)]
        }
else:
    m_hashGeneIDs = None


def funcCounter(iter):
    """Advance *iter* by one step and format the value it yields.

    The value is rendered with the ``%02d`` format, i.e. as a decimal
    integer left-padded with zeros to a minimum width of two characters.
    """
    iValue = next(iter)
    return "%02d" % iValue


def funcGeneIDMapping(pE,
                      fileDATin,
Пример #2
0

# Command line: one SDRF file followed by zero or more ADF files.
if len(sys.argv) < 2:
    raise Exception("Usage: samples2pcl.py <sdrf.txt> [adf.txt]+")
strSDRF, astrADFs = sys.argv[1], sys.argv[2:]

# Index the SDRF rows: both the table value and the source value of a row
# map to the same (source, description) pair. The taxonomy ID is taken from
# the first row whose organism yields a truthy arepa.org2taxid result.
# NOTE(review): arepa.entable presumably yields one row per line holding the
# columns selected by the given predicates - confirm against arepa.
strTaxID = None
hashSDRF = {}
# The file stays open while the rows are consumed and is closed afterwards.
with open(strSDRF) as fileSDRF:
    aastrSDRF = arepa.entable(fileSDRF,
                              [issource, istable, isdesc, isorg])
    for astrLine in aastrSDRF:
        strSource, strTable, strDesc, strOrg = astrLine
        if strSource and strTable:
            hashSDRF[strTable] = (strSource, strDesc)
            hashSDRF[strSource] = (strSource, strDesc)
        if strOrg and (not strTaxID):
            strTaxID = arepa.org2taxid(strOrg)

# Probe -> gene mapping collected over all ADF files; when a probe occurs
# more than once, the last occurrence wins.
hashADFs = {}
for strADF in astrADFs:
    with open(strADF) as fileADF:
        aastrADF = arepa.entable(fileADF, [isprobe, isgene, lambda x: x])
        for astrLine in aastrADF:
            # The third column is unpacked but not used here.
            strProbe, strGene, strX = astrLine
            if strProbe and strGene:
                hashADFs[strProbe] = strGene

hashCache = {}
fFirst = True
for strLine in sys.stdin:
    astrLine = strLine.strip().split("\t")
    strID, astrData = astrLine[0], astrLine[1:]
    if fFirst:
Пример #3
0
    pSOFT.open(gzip.open(strGPLGZ))
pMetadata.checksum(' '.join(checksum))
pSOFT.open(sys.stdin)

# Copy the attributes of every DATASET record into the metadata object.
for pDS in pSOFT.get("DATASET").values():
    pMetadata.pmid(pDS.get_attribute("dataset_pubmed_id"))
    pMetadata.title(pDS.get_attribute("dataset_title"))
    pMetadata.gloss(pDS.get_attribute("dataset_description"))
    # Lower-case the dataset type and drop everything from " by " onwards;
    # a missing type becomes the empty string.
    pMetadata.type(
        re.sub(r' by .+$', "", (pDS.get_attribute("dataset_type")
                                or "").lower()))
    pMetadata.channels(pDS.get_attribute("dataset_channel_count"))
    pMetadata.conditions(pDS.get_attribute("dataset_sample_count"))
    pMetadata.platform(pDS.get_attribute("dataset_platform"))
    pMetadata.taxid(
        arepa.org2taxid(pDS.get_attribute("dataset_sample_organism")))

# Auxiliary Metadata
# The first non-empty row of the tab-delimited file is the header row: it is
# stored under c_hashkeyCurated. Each later row appends its cells to the list
# kept under the header of the same column.
if strMetadata:
    astrHeaders = None
    with open(strMetadata) as fileMetadata:
        for astrLine in csv.reader(fileMetadata, csv.excel_tab):
            if astrHeaders:
                # A row longer than the header row still raises IndexError.
                for i, strValue in enumerate(astrLine):
                    pMetadata.setdefault(astrHeaders[i], []).append(strValue)
            else:
                pMetadata[c_hashkeyCurated] = astrLine
                astrHeaders = astrLine

# Add Mapping Status and Save
# The first entry returned by sfle.readcomment must be a single
# "key<TAB>value" pair.
with open(strStatus) as fileStatus:
    k, v = sfle.readcomment(fileStatus)[0].split("\t")
pMetadata.update({k: v})
Пример #4
0
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE 
OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

getinfo.py: 

acquire intermediate information about dataset, 
used to get around pathologies in scons build
"""

import arepa
import sys
import re

# Source kind -> regular expression whose single group captures the value of
# interest: a numeric taxonomy ID for "gse", an organism name for "gds".
pHashRE = {
    "gse": r'Series_platform_taxid\t"([0-9]*)"',
    "gds": r'dataset_sample_organism = ([A-Za-z0-9_\- ]*)',
}

# The whole input is read at once so each pattern can be searched in it.
strf = sys.stdin.read()

for strKey, strVal in pHashRE.items():
    pMatch = re.search(strVal, strf)
    if pMatch is None:
        # This pattern does not occur in the input; print nothing for it.
        continue
    strHit = pMatch.group(1)
    if strKey == "gds":
        # The organism name is converted through arepa.org2taxid.
        print(arepa.org2taxid(strHit))
    elif strKey == "gse":
        # The captured taxonomy ID is printed unchanged.
        print(strHit)
Пример #5
0
    astrLine = strLine.strip().split("\t")
    if astrLine[0] == "PubMed ID":
        metadatum(pMetadata.pmid, astrLine[1:])
    elif astrLine[0] == "Investigation Title":
        metadatum(pMetadata.title, astrLine[1:])
    elif astrLine[0] == "Experiment Description":
        metadatum(pMetadata.gloss, astrLine[1:])
    elif astrLine[0] == "Experimental Design":
        metadatum(pMetadata.type,
                  [re.sub(r' by .+$', "", s.lower()) for s in astrLine[1:]])
pMetadata.store_checksum()

# SDRF: record the number of extracted rows as the condition count and the
# set of distinct organisms as the taxonomy IDs.
# NOTE(review): arepa.entable presumably returns the columns whose headers
# satisfy the two predicates (source name, organism) - confirm against arepa.
if strSDRF:
    # The file stays open while the rows are used and is closed afterwards.
    with open(strSDRF) as fileSDRF:
        aastrSDRF = arepa.entable(fileSDRF, [
            lambda s: s == "Source Name",
            lambda s: re.search(r'Characteristics\s*\[Organism\]', s),
        ])
        metadatum(pMetadata.conditions, [str(len(aastrSDRF))])
        # Fall back to the raw organism string when arepa.org2taxid returns
        # a falsy value for it.
        setTaxa = {
            arepa.org2taxid(astrLine[1]) or astrLine[1]
            for astrLine in aastrSDRF
        }
    metadatum(pMetadata.taxid, list(setTaxa))

# ADF: the cells following "Array Design Name" are recorded as the platform.
for strADF in astrADFs:
    with open(strADF) as fileADF:
        for strLine in fileADF:
            astrLine = strLine.strip().split("\t")
            if astrLine[0] == "Array Design Name":
                metadatum(pMetadata.platform, astrLine[1:])
pMetadata.save(sys.stdout)