Example #1
"""Remove duplicates in a fasta file"""
import sys
from collections import Counter

from dp.associations import GeneAssociations
from dp.ontology import Ontology
from dp.utils import parseFasta
seqs = set()   # sequences seen so far (for deduplication)
names = set()  # names seen so far
fastafile = open(sys.argv[1])

MIN_SEQ_LEN = 32   # drop sequences shorter than this
MAX_SEQ_UNK = 0.1  # maximum tolerated fraction of unknown residues

TAXONS_HOMO_SAPIENS = {9606}  # NCBI taxonomy id for Homo sapiens
asoc = GeneAssociations.fromFile(sys.argv[2], taxons=TAXONS_HOMO_SAPIENS)
ontology = Ontology(sys.argv[3])
ontology.setAssociations(asoc)
asoc.transitiveClosure()
# Upper-cased names of every gene that has at least one association.
associated = set()
for v in asoc.associations.values():
    associated.update(g.upper() for g in v)

# parseFasta presumably yields (name, value) pairs, so dict() maps each name
# to its value; data/ss.txt maps "<PDB>:<CHAIN>:secstr" names to
# secondary-structure strings (see sskey below).
ss = dict(parseFasta("data/ss.txt"))
#print(associated)

for l in fastafile:
    # Header line: ">NAME mol:TYPE ..."; the sequence follows on the next line.
    name, typ, *_ = l[1:].split(" ")
    name = name.upper()
    seq = next(fastafile)
    sskey = "%s:secstr" % name.replace("_", ":")
    if typ != 'mol:protein' \
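
The listing is cut off at the filter condition, so the tail of the loop is not shown. A minimal sketch of how the check might conclude, assuming only the constants and bookkeeping sets defined above (the helper name is_new_protein and each individual check are assumptions, not the original code):

def is_new_protein(name, typ, seq):
    """Hypothetical filter mirroring the truncated condition above."""
    if typ != 'mol:protein':
        return False  # not a protein record
    if len(seq) < MIN_SEQ_LEN:
        return False  # too short
    if seq.count("X") / len(seq) > MAX_SEQ_UNK:
        return False  # too many unknown residues
    if seq in seqs or name in names:
        return False  # duplicate sequence or name
    seqs.add(seq)
    names.add(name)
    return True

Under this reading, the loop would skip a record when is_new_protein(...) returns False and otherwise write the header and sequence out.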
Example #2
                      help="TreeLiker jar binary.")

    options, args = parser.parse_args()

    if len(args) != 2:
        parser.error("Incorect number of arguments!")

    oboFileName, associationsFileName = args

    dp.utils.verbosity = options.verbosity

    if options.backgroundKnowledge:
        with open(options.backgroundKnowledge) as bk:
            Gene.backgroundKnowledge = bk.read().splitlines()

    ontology = Ontology(oboFileName)
    ontology.geneFactory.deserialize = options.deserialize is None
    #associations = GeneAssociations(associationsFileName, TAXONS_SACCHAROMYCES_CEREVISIAE)

    dataset = None
    if options.dataset:
        # FIXME: When the dataset changes, the serialized associations must be regenerated; this is a serious bug unless random is seeded with a constant.
        with open(options.dataset) as datasetFile:
            dataset = [l.strip() for l in datasetFile]
        random.shuffle(dataset)
        #assert options.reserve > 0.0
        #if options.reserve < 1.0: # Use ratio
        #    splitIndex = int(options.reserve * len(dataset))
        #else:
        #    splitIndex = int(options.reserve)
        #reserved = set(dataset[:splitIndex])
        #dataset = set(dataset[splitIndex:])
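
The commented-out block above reads options.reserve as a ratio when it is below 1.0 and as an absolute count otherwise. A minimal sketch of that split as a standalone helper (the name split_reserve is hypothetical; the semantics are taken from the commented-out lines, and the dataset is assumed to be already shuffled):

def split_reserve(dataset, reserve):
    # reserve < 1.0: fraction of the dataset to hold out;
    # otherwise: absolute number of held-out items.
    assert reserve > 0.0
    splitIndex = int(reserve * len(dataset)) if reserve < 1.0 else int(reserve)
    return set(dataset[:splitIndex]), set(dataset[splitIndex:])

# reserved, dataset = split_reserve(dataset, options.reserve)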