#!/usr/bin/env python
"""Remove duplicates in a FASTA file."""
import sys

from dp.associations import GeneAssociations
from dp.ontology import Ontology
from dp.utils import parseFasta

MIN_SEQ_LEN = 32    # drop sequences shorter than this
MAX_SEQ_UNK = 0.1   # drop sequences with more than 10% unknown residues
TAXONS_HOMO_SAPIENS = {9606}

seqs = set()   # sequences already seen
names = set()  # names already seen
fastafile = open(sys.argv[1])

asoc = GeneAssociations.fromFile(sys.argv[2], taxons=TAXONS_HOMO_SAPIENS)
ontology = Ontology(sys.argv[3])
ontology.setAssociations(asoc)
asoc.transitiveClosure()

# Collect every gene name that carries at least one association.
associated = set()
for k, v in asoc.associations.items():
    associated.update({g.upper() for g in v})

# Secondary-structure annotations, keyed like "XXXX:Y:secstr".
ss = dict(parseFasta("data/ss.txt"))

for l in fastafile:
    # Header lines look like ">NAME TYPE ..."; the sequence follows.
    name, typ, *_ = l[1:].split(" ")
    name = name.upper()
    seq = next(fastafile)
    sskey = "%s:secstr" % name.replace("_", ":")
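    # The original loop body ends here. What follows is an assumed sketch of
    # the filtering that the so-far-unused constants and sets above suggest,
    # not the author's confirmed logic.
    seq = seq.strip()
    if len(seq) < MIN_SEQ_LEN:
        continue  # assumed: too short to keep
    if seq.count("X") / len(seq) > MAX_SEQ_UNK:
        continue  # assumed: too many unknown ('X') residues
    if name in names or seq in seqs:
        continue  # assumed: duplicate name or sequence
    if name not in associated or sskey not in ss:
        continue  # assumed: no association or no secondary structure
    names.add(name)
    seqs.add(seq)
    print(">%s %s" % (name, typ))
    print(seq)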
# Fragment from a larger script: `options`, `associationsFileName`, `TAXONS`
# and `ontology` are defined earlier in that script.
dataset = None
if options.dataset:
    # FIXME: When the dataset changes, the serialized associations must be
    # regenerated. This is a serious bug unless random is seeded with a
    # constant.
    dataset = [l.strip() for l in open(options.dataset)]
    random.shuffle(dataset)
    #assert options.reserve > 0.0
    #if options.reserve < 1.0:  # Use ratio
    #    splitIndex = int(options.reserve * len(dataset))
    #else:
    #    splitIndex = int(options.reserve)
    #reserved = set(dataset[:splitIndex])
    #dataset = set(dataset[splitIndex:])
    dataset = set(dataset)

associations = GeneAssociations.fromFile(associationsFileName, taxons=TAXONS, dataset=dataset)
#reservedAssociations = GeneAssociations.fromFile(associationsFileName + "_reserved", dataset=reserved)
ontology.setAssociations(associations)
#ontology.setAssociations(reservedAssociations, 'reserved')

if options.associationsDump:
    # Serialize the associations and stop; nothing below runs in dump mode.
    associations.serialize(options.associationsDump)
    #reservedAssociations.serialize(options.associationsDump + "_reserved")
    sys.exit()

# Prune rare terms, cap association sizes, then report and export the graph.
ontology.deleteSmallTerms(options.lb)
associations.shrink(options.max, options.lb)
ontology.overView()
ontology.dotExport()
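# The fragment above assumes an `options` namespace plus `random` and `sys`
# imports from the enclosing script. A hypothetical sketch of the setup it
# might come from (flag names and defaults are guesses, not the real CLI):
import random
import sys
from optparse import OptionParser

parser = OptionParser()
parser.add_option("--dataset", dest="dataset", help="file with one gene name per line")
parser.add_option("--associations-dump", dest="associationsDump", help="serialize associations here and exit")
parser.add_option("--lb", dest="lb", type="int", default=50, help="minimum term size to keep")
parser.add_option("--max", dest="max", type="int", default=1000, help="maximum associations per term")
options, args = parser.parse_args()

random.seed(0)  # a constant seed keeps the shuffle reproducible (see FIXME above)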