def load_transcript_strands(loc):
    """Build a mapping of transcript stable ID (column 14) to strand (column 6).

    Reads the gzipped Ensembl ``transcript`` table dump found under *loc*
    and returns a plain dict ``{stable_id: strand_int}``.
    """
    strands = {}
    path = loc + 'transcript.txt.gz'
    with fast_gzip_read(path) as handle:
        # count_lines gives tqdm an accurate total for the progress bar
        for row in tqdm(handle, total=count_lines(path)):
            fields = row.split('\t')
            strands[fields[14]] = int(fields[6])
    return strands
def load_chromosome_and_region_names(loc):
    """Build a mapping of seq_region_id (column 0) to region name (column 1).

    Reads the gzipped Ensembl ``seq_region`` table dump under *loc*;
    the table layout is: seq_region_id, name, cord_system_fk.
    """
    regions = {}
    path = loc + 'seq_region.txt.gz'
    with fast_gzip_read(path) as handle:
        for row in tqdm(handle, total=count_lines(path)):
            fields = row.split('\t')
            regions[int(fields[0])] = fields[1]
    return regions
def load_variation_sources(loc):
    """Build a mapping of source_id (column 0) to source name (column 1).

    Reads the gzipped Ensembl ``source`` table dump under *loc*; the full
    layout is: source_id, name, version, description, url, type,
    somatic_status, data_types — only the first two columns are kept.
    """
    path = loc + 'source.txt.gz'
    names = {}
    with fast_gzip_read(path) as handle:
        for row in tqdm(handle, total=count_lines(path)):
            fields = row.split('\t')
            names[int(fields[0])] = fields[1]
    # reclaim memory held by the parsing loop before handing the dict back
    gc.collect()
    return names
def get_cds_positions(transcripts):
    """Return ``{refseq_id: (cds_start, cds_end)}`` for the given transcripts.

    Scans the gzipped UCSC refGene dump; only rows whose ``name`` column
    (RefSeq ID, column 1) is present in *transcripts* are kept. Columns 6
    and 7 of the table are cdsStart and cdsEnd.
    """
    positions = {}
    with fast_gzip_read('ucsc/ref_gene.tsv.gz') as handle:
        # first line is the UCSC header row — skip it
        next(handle)
        for row in handle:
            fields = row.split('\t')
            refseq_id = fields[1]
            if refseq_id in transcripts:
                cds_start = int(fields[6])
                cds_end = int(fields[7])
                positions[refseq_id] = (cds_start, cds_end)
    return positions
def __init__(self, filename=None):
    """Load a RefSeq -> Ensembl identifier mapping from a gzipped dump.

    Parameters
    ----------
    filename:
        Path to the two-column gzipped mapping file. Falls back to
        ``self.filename`` when not given.

    Raises
    ------
    ValueError
        If neither *filename* nor ``self.filename`` is set.
    """
    data = defaultdict(list)
    if not filename:
        filename = self.filename
    if not filename:
        # was a bare ``raise ValueError`` — include a message so callers
        # get an actionable diagnostic (exception type is unchanged)
        raise ValueError(
            'no filename given and no default filename set on the instance'
        )
    with fast_gzip_read(filename, processes=6) as f:
        header = next(f)
        assert header == '#hg19.knownToRefSeq.value hg19.knownToEnsembl.value\n'
        for line in f:
            fields = line.strip().split('\t')
            try:
                ref_id, mapped_id = fields
            except ValueError:
                # skip rows that do not have exactly two columns
                continue
            # 'n/a' marks RefSeq IDs with no Ensembl counterpart
            if mapped_id != 'n/a':
                data[ref_id].append(mapped_id)
    self.data = data
def _get_all_zscores():
    """Collect every dpsi_zscore from the SPIDEX dump as a list of floats.

    Rows that fail to parse are reported with ``print`` and skipped.
    """
    from multiprocess import fast_gzip_read
    print('Counting...')
    total = count_spidex()
    print('Loading...')
    zscores = []
    with fast_gzip_read(SPIDEX_LOCATION) as f:
        next(f)  # skip the header line
        # NOTE(review): the column index comes from the module-level
        # ``headers`` list, not from the header line just read — confirm
        # the two actually agree.
        extract = itemgetter(headers.index('dpsi_zscore'))
        for line in tqdm(f, total=total - 1):
            try:
                fields = line.rstrip('\n').split('\t')
                zscores.append(float(extract(fields)))
            except Exception as e:
                # best-effort load: report the bad row and keep going
                print(e)
    return zscores
def import_expressed_genes(bdb, tissues=GTEX_TISSUES, path=DEFAULT_PATH, suffix=DEFAULT_GENE_SUFFIX):
    """Populate *bdb* with per-gene metadata from GTEx expression files.

    For every gene_id key the stored record is columns 1-5 of the file:
    gene_name, gene_chr, gene_start, gene_end, strand. A gene seen in
    several tissues must carry identical metadata each time.
    """
    print('Importing expressed genes:')
    total = count_all(tissues, path, suffix)
    with tqdm(total=total) as progress:
        for tissue in tissues:
            name = tissue + suffix
            full_path = os.path.join(path, name)
            print('Loading', name)
            with fast_gzip_read(full_path) as handle:
                next(handle)  # skip header
                for line in handle:
                    fields = line.split()
                    gene_id = fields[0]
                    metadata = fields[1:6]
                    if bdb[gene_id]:
                        # already imported from another tissue — must match
                        assert bdb[gene_id] == metadata
                    else:
                        bdb[gene_id].extend(metadata)
                    progress.update(1)
def iterate_over_expression(tissues_list=GTEX_TISSUES, path=DEFAULT_PATH, suffix=DEFAULT_SUFFIX):
    """Yield ``(variant_id, tissue_name, slope, gene_id)`` tuples.

    Streams each tissue's gzipped GTEx eQTL file in turn; column positions
    are resolved from each file's own header line, so files with different
    column orders are handled correctly.
    """
    for tissue in tissues_list:
        name = tissue + suffix
        full_path = os.path.join(path, name)
        print('Loading', name)
        with fast_gzip_read(full_path, processes='all') as handle:
            # map column name -> position from this file's header
            columns = {
                column: index
                for index, column in enumerate(next(handle).split())
            }
            slope_at = columns['slope']
            gene_at = columns['gene_id']
            variant_at = columns['variant_id']
            for line in handle:
                fields = line.split()
                yield fields[variant_at], tissue, fields[slope_at], fields[gene_at]