"""Remove duplicates in a fasta file"""
import sys
from dp.associations import GeneAssociations
from dp.ontology import Ontology
from collections import Counter
from dp.utils import parseFasta

# NOTE(review): this chunk ends in the middle of the main loop's first `if`
# condition; the dangling line continuation at the bottom is kept verbatim
# because the rest of the statement is outside the visible source.

# Empty sets created up front; presumably filled by the part of the loop that
# is not visible here to track already-seen sequences and names -- TODO confirm.
seqs = set()
names = set()

# Input file: path taken from the first command-line argument. It is opened
# without a context manager and is not closed anywhere in the visible code.
fastafile = open(sys.argv[1])

# Numeric thresholds; they are not referenced in the visible part of the
# script. Judging by the names they bound the sequence length and the share
# of unknown residues -- TODO confirm against the rest of the loop.
MIN_SEQ_LEN = 32
MAX_SEQ_UNK = 0.1

# Taxon id set handed to GeneAssociations.fromFile as its `taxons` argument.
TAXONS_HOMMO_SAPIENS = {9606}

# Load associations from the file named by the second argument, build the
# ontology from the third argument, attach the associations to it, and then
# call transitiveClosure() on the associations.
asoc = GeneAssociations.fromFile(sys.argv[2], taxons=TAXONS_HOMMO_SAPIENS)
ontology = Ontology(sys.argv[3])
ontology.setAssociations(asoc)
asoc.transitiveClosure()

# Gather the upper-cased form of every member of every value stored in
# asoc.associations (presumably gene names -- TODO confirm). The keys are
# not used.
associated = set()
for k, v in asoc.associations.items():
    associated.update({g.upper() for g in v})

# Result of parseFasta on a hard-coded relative path, materialised as a dict.
ss = dict(parseFasta("data/ss.txt"))
#print(associated)

# The file is consumed two lines per iteration: `l` is treated as a header
# line and the line fetched with next() as its sequence.
for l in fastafile:
    # Skip the first character of the header and split on single spaces;
    # the first field is the name, the second the type, the rest is ignored.
    name, typ, *_ = l[1:].split(" ")
    name = name.upper()
    # next() raises StopIteration if the header is the last line of the file.
    seq = next(fastafile)
    # Key derived from the name: underscores become colons and ":secstr" is
    # appended. Presumably used to look up `ss` -- TODO confirm.
    sskey = "%s:secstr" % name.replace("_", ":")
    if typ != 'mol:protein' \
help="TreeLiker jar binary.") options, args = parser.parse_args() if len(args) != 2: parser.error("Incorect number of arguments!") oboFileName, associationsFileName = args dp.utils.verbosity = options.verbosity if options.backgroundKnowledge: with open(options.backgroundKnowledge) as bk: Gene.backgroundKnowledge = bk.read().splitlines() ontology = Ontology(oboFileName) ontology.geneFactory.deserialize = True if options.deserialize is None else False #associations = GeneAssociations(associationsFileName, TAXONS_SACCHAROMYCES_CEREVISIAE) dataset = None if options.dataset: # FIXME: When dataset is changed, serialized associations need to be regenerated. This is serious bug if we don't seed random to a constant dataset = [l.strip() for l in open(options.dataset)] random.shuffle(dataset) #assert options.reserve > 0.0 #if options.reserve < 1.0: # Use ratio # splitIndex = int(options.reserve * len(dataset)) #else: # splitIndex = int(options.reserve) #reserved = set(dataset[:splitIndex]) #dataset = set(dataset[splitIndex:])