def collect_collections(organism, organism_file, given_colls, coll_file, all_colls):
    """
    Gather gene-set collections and count the distinct genes across them.

    When ``organism_file`` is None, every name in ``all_colls`` is looked up
    as ``<name>.json`` under the resources directory, in the ``collections``
    sub-folder named after the part of ``organism`` before its first
    underscore. Each file that exists contributes its genes to the count,
    but only names also present in ``given_colls`` are returned.

    When ``coll_file`` is not None, it is parsed with ``_read_gmt_file`` and
    the result is both counted and returned.

    Returns a ``(collections, unique_gene_count)`` tuple.
    """
    selected = []
    gene_lists = []

    if organism_file is None:
        base_dir = os.path.join(
            get_resources_dir(), 'collections', organism.split('_')[0]
        )
        for name in all_colls:
            json_path = os.path.join(base_dir, name + '.json')
            if not os.path.exists(json_path):
                # silently skip collections that have no file on disk
                continue
            with open(json_path, 'r') as handle:
                loaded = json.load(handle)
            # every existing collection counts towards the gene universe,
            # whether or not it was requested
            gene_lists += [entry['genes'] for entry in loaded['geneSets']]
            if name in given_colls:
                selected.append(loaded)

    if coll_file is not None:
        custom = _read_gmt_file(coll_file)
        gene_lists += [entry['genes'] for entry in custom['geneSets']]
        selected.append(custom)

    # flatten the per-gene-set lists and de-duplicate to get the universe size
    unique_genes = set(chain.from_iterable(gene_lists))
    return selected, len(unique_genes)
def generate(organism, organism_file):
    """
    Generates an interval tree of mapping from downloaded file

    The mapping is read from one of two places:

    * ``organism_file`` is None: ``<resources>/mappings/<organism>.json`` is
      loaded if it exists. It is expected to hold a list of objects with
      ``chrName``, ``start``, ``end`` and ``symbol`` keys. If the file does
      not exist the mapping stays empty.
    * otherwise: ``organism_file`` is read as tab-separated text. Only rows
      with exactly four columns (chrName, start, end, symbol) are used; all
      other rows are skipped.

    Returns a plain dict keyed by chromosome name. Each value is an
    ``IntervalTree`` with one interval per mapping entry, carrying the
    entry's ``symbol`` as its data. ``start`` and ``end`` are converted with
    ``int()``, so a non-numeric value raises ``ValueError``.
    """
    # parse either JSON file or given Tab-separated file as mapping
    mapping = []
    if organism_file is None:
        mapping_path = path.join(get_resources_dir(), 'mappings', organism + '.json')
        if path.exists(mapping_path):
            with open(mapping_path, 'r') as f:
                mapping = json.load(f)
    else:
        with open(organism_file, 'r') as rows:
            for row in rows:
                cols = row.strip().split('\t')
                if len(cols) == 4:
                    mapping.append({
                        'chrName': cols[0],
                        'start': cols[1],
                        'end': cols[2],
                        'symbol': cols[3],
                    })

    # generate mapping interval tree for fast lookup
    mapping_tree = {}
    for each in mapping:
        chr_name = each['chrName']
        # have we added that chromosome name before?
        # (`in` replaces dict.has_key, which no longer exists in Python 3)
        if chr_name not in mapping_tree:
            mapping_tree[chr_name] = IntervalTree()
        mapping_tree[chr_name].addi(
            int(each['start']), int(each['end']), each['symbol'])
    return mapping_tree