예제 #1
0
def collect_collections(organism, organism_file, given_colls, coll_file, all_colls):
    """
    Gather gene-set collections and count the distinct genes they contain.

    When ``organism_file`` is None, every name in ``all_colls`` is looked up
    as ``<name>.json`` inside ``<resources>/collections/<prefix>``, where
    ``<prefix>`` is the part of ``organism`` before its first underscore.
    Names without a matching file are skipped. The genes of every collection
    found are counted, but a collection is only returned when its name is
    also in ``given_colls``.

    When ``coll_file`` is not None, it is read with ``_read_gmt_file`` and the
    result is always both counted and returned.

    Returns a ``(collections, unique_gene_count)`` tuple.
    """
    selected = []
    gene_lists = []

    if organism_file is None:
        base_dir = os.path.join(
            get_resources_dir(), 'collections', organism.split('_')[0])
        for name in all_colls:
            json_path = os.path.join(base_dir, name + '.json')
            if not os.path.exists(json_path):
                continue
            with open(json_path, 'r') as handle:
                loaded = json.load(handle)
            # every available collection contributes to the gene universe,
            # even when the caller did not ask for it
            for gene_set in loaded['geneSets']:
                gene_lists.append(gene_set['genes'])
            if name in given_colls:
                selected.append(loaded)

    if coll_file is not None:
        custom = _read_gmt_file(coll_file)
        for gene_set in custom['geneSets']:
            gene_lists.append(gene_set['genes'])
        selected.append(custom)

    # fold all per-gene-set lists into one set to count distinct genes
    unique_genes = set()
    for gene_list in gene_lists:
        unique_genes.update(gene_list)
    return selected, len(unique_genes)
예제 #2
0
def generate(organism, organism_file):
    """
    Build a per-chromosome interval tree of gene symbols.

    The mapping entries come from one of two sources:

    * ``organism_file`` is None: the JSON file
      ``<resources>/mappings/<organism>.json`` is loaded. Each entry is
      read through its 'chrName', 'start', 'end' and 'symbol' keys. If the
      file does not exist the mapping stays empty.
    * otherwise: ``organism_file`` is read as tab-separated text. Only rows
      with exactly four columns (chromosome name, start, end, symbol) are
      used; any other row is skipped.

    Returns a dict that maps each chromosome name to an ``IntervalTree``
    holding one ``(int(start), int(end), symbol)`` interval per entry. The
    dict is empty when no mapping entries were found.

    Raises ValueError if a start or end value cannot be converted by
    ``int()``.
    """
    # parse either the JSON file or the given tab-separated file as mapping
    mapping = []

    if organism_file is None:
        mapping_path = path.join(get_resources_dir(), 'mappings', organism + '.json')
        if path.exists(mapping_path):
            with open(mapping_path, 'r') as f:
                mapping = json.load(f)
    else:
        with open(organism_file, 'r') as rows:
            for row in rows:
                cols = row.strip().split('\t')
                if len(cols) == 4:
                    mapping.append({
                        'chrName': cols[0],
                        'start': cols[1],
                        'end': cols[2],
                        'symbol': cols[3],
                    })

    # generate the mapping interval trees for fast lookup
    mapping_tree = {}
    for entry in mapping:
        chr_name = entry['chrName']
        # `in` replaces dict.has_key(), which no longer exists in Python 3
        if chr_name not in mapping_tree:
            mapping_tree[chr_name] = IntervalTree()
        mapping_tree[chr_name].addi(
            int(entry['start']), int(entry['end']), entry['symbol'])

    return mapping_tree