"bridgemapper.py") c_strManualGeneIDs = sfle.d(arepa.path_repo(), sfle.c_strDirEtc, "manual_geneid") c_pHashBridgeDB = dict( (strName, sfle.d(arepa.path_arepa(), "GeneMapper", sfle.c_strDirEtc, strDB)) for strName, strDB in ( ("H**o sapiens", "Hs_Derby_20120602.bridge"), ("Mus musculus", "Mm_Derby_20120602.bridge"), ("Saccharomyces cerevisiae", "Sc_Derby_20120602.bridge"), )) pHashBridgeDBTaxIDs = { k: arepa.org2taxid(k, True) for k in list(c_pHashBridgeDB.keys()) } m_hashGeneIDs = { k: v for k, v in [x.split("\t") for x in sfle.readcomment(open(c_strManualGeneIDs))] } if os.path.exists(c_strManualGeneIDs) else None def funcCounter(iter): return ("%02d" % next(iter)) def funcGeneIDMapping(pE, fileDATin,
if len(sys.argv) < 2: raise Exception("Usage: samples2pcl.py <sdrf.txt> [adf.txt]+") strSDRF, astrADFs = sys.argv[1], sys.argv[2:] strTaxID = None hashSDRF = {} aastrSDRF = arepa.entable(open(strSDRF), [issource, istable, isdesc, isorg]) for astrLine in aastrSDRF: strSource, strTable, strDesc, strOrg = astrLine if strSource and strTable: hashSDRF[strTable] = (strSource, strDesc) hashSDRF[strSource] = (strSource, strDesc) if strOrg and (not strTaxID): strTaxID = arepa.org2taxid(strOrg) hashADFs = {} for strADF in astrADFs: aastrADF = arepa.entable(open(strADF), [isprobe, isgene, lambda x: x]) for astrLine in aastrADF: strProbe, strGene, strX = astrLine if strProbe and strGene: hashADFs[strProbe] = strGene hashCache = {} fFirst = True for strLine in sys.stdin: astrLine = strLine.strip().split("\t") strID, astrData = astrLine[0], astrLine[1:] if fFirst:
pSOFT.open(gzip.open(strGPLGZ))
pMetadata.checksum(' '.join(checksum))
pSOFT.open(sys.stdin)

# Copy the attributes of every DATASET entry into the metadata object.
for pDS in pSOFT.get("DATASET").values():
    pMetadata.pmid(pDS.get_attribute("dataset_pubmed_id"))
    pMetadata.title(pDS.get_attribute("dataset_title"))
    pMetadata.gloss(pDS.get_attribute("dataset_description"))
    # Lower-case the type and drop any trailing " by ..." qualifier; a missing
    # attribute is treated as the empty string.
    pMetadata.type(
        re.sub(r' by .+$', "", (pDS.get_attribute("dataset_type") or "").lower()))
    pMetadata.channels(pDS.get_attribute("dataset_channel_count"))
    pMetadata.conditions(pDS.get_attribute("dataset_sample_count"))
    pMetadata.platform(pDS.get_attribute("dataset_platform"))
    pMetadata.taxid(
        arepa.org2taxid(pDS.get_attribute("dataset_sample_organism")))

# Auxiliary Metadata
if strMetadata:
    astrHeaders = None
    with open(strMetadata) as fileMetadata:
        for astrLine in csv.reader(fileMetadata, csv.excel_tab):
            if astrHeaders:
                # Append each cell to the list stored under its column header.
                # A row longer than the header row raises IndexError.
                for i, strValue in enumerate(astrLine):
                    pMetadata.setdefault(astrHeaders[i], []).append(strValue)
            else:
                # The first non-empty row is the header row; it is also
                # recorded under c_hashkeyCurated.
                pMetadata[c_hashkeyCurated] = astrLine
                astrHeaders = astrLine

# Add Mapping Status and Save
# The first entry returned for the status file holds one tab-separated
# key/value pair.
with open(strStatus) as fileStatus:
    k, v = sfle.readcomment(fileStatus)[0].split("\t")
pMetadata.update({k: v})
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. getinfo.py: acquire intermediate information about dataset, used to get around pathologies in scons build """ import arepa import sys import re pHashRE = { "gse": r'Series_platform_taxid\t"([0-9]*)"', "gds": r'dataset_sample_organism = ([A-Za-z0-9_\- ]*)' } strf = sys.stdin.read() for strKey, strVal in list(pHashRE.items()): astrMatch = re.findall(strVal, strf) if astrMatch: if strKey == "gse": print(astrMatch[0]) elif strKey == "gds": print(arepa.org2taxid(astrMatch[0]))
# NOTE(review): strLine is bound by code that precedes this chunk; the
# statements down to the end of the if/elif chain presumably form the body of
# a per-line loop - confirm and indent them under that loop's header.
astrLine = strLine.strip().split("\t")
# The first tab-separated field names the record; the remaining fields are
# handed to metadatum together with the matching metadata setter.
if astrLine[0] == "PubMed ID":
    metadatum(pMetadata.pmid, astrLine[1:])
elif astrLine[0] == "Investigation Title":
    metadatum(pMetadata.title, astrLine[1:])
elif astrLine[0] == "Experiment Description":
    metadatum(pMetadata.gloss, astrLine[1:])
elif astrLine[0] == "Experimental Design":
    # Lower-case each design and drop any trailing " by ..." qualifier.
    metadatum(pMetadata.type,
              [re.sub(r' by .+$', "", s.lower()) for s in astrLine[1:]])

pMetadata.store_checksum()

if strSDRF:
    setTaxa = set()
    with open(strSDRF) as fileSDRF:
        # Select the "Source Name" and "Characteristics [Organism]" columns.
        aastrSDRF = arepa.entable(fileSDRF, [
            lambda s: s == "Source Name",
            lambda s: re.search(r'Characteristics\s*\[Organism\]', s),
        ])
        # The number of rows returned is recorded as the condition count.
        metadatum(pMetadata.conditions, [str(len(aastrSDRF))])
        for astrLine in aastrSDRF:
            strOrg = astrLine[1]
            # Fall back to the raw organism string when no taxon ID is found.
            setTaxa.add(arepa.org2taxid(strOrg) or strOrg)
    metadatum(pMetadata.taxid, list(setTaxa))

for strADF in astrADFs:
    # Close each ADF as soon as it has been scanned.
    with open(strADF) as fileADF:
        for strLine in fileADF:
            astrLine = strLine.strip().split("\t")
            if astrLine[0] == "Array Design Name":
                metadatum(pMetadata.platform, astrLine[1:])

pMetadata.save(sys.stdout)