def load_transcript_strands(loc):
    """Build a mapping of transcript stable ID (column 14) to strand (column 6).

    Reads the gzipped Ensembl ``transcript`` table dump found under *loc*
    and returns a plain dict ``{stable_id: strand_int}``.
    """
    strands = {}
    path = loc + 'transcript.txt.gz'
    with fast_gzip_read(path) as handle:
        # count_lines gives tqdm an accurate total for the progress bar
        for row in tqdm(handle, total=count_lines(path)):
            fields = row.split('\t')
            strands[fields[14]] = int(fields[6])
    return strands
def load_chromosome_and_region_names(loc):
    """Build a mapping of seq_region_id (column 0) to region name (column 1).

    Reads the gzipped Ensembl ``seq_region`` table dump under *loc*;
    the table layout is: seq_region_id, name, cord_system_fk.
    """
    regions = {}
    path = loc + 'seq_region.txt.gz'
    with fast_gzip_read(path) as handle:
        for row in tqdm(handle, total=count_lines(path)):
            fields = row.split('\t')
            regions[int(fields[0])] = fields[1]
    return regions
def load_variation_sources(loc):
    """Build a mapping of source_id (column 0) to source name (column 1).

    Reads the gzipped Ensembl ``source`` table dump under *loc*; the full
    layout is: source_id, name, version, description, url, type,
    somatic_status, data_types — only the first two columns are kept.
    """
    path = loc + 'source.txt.gz'
    names = {}
    with fast_gzip_read(path) as handle:
        for row in tqdm(handle, total=count_lines(path)):
            fields = row.split('\t')
            names[int(fields[0])] = fields[1]
    # reclaim memory held by the parsing loop before handing the dict back
    gc.collect()
    return names
def get_cds_positions(transcripts):
    """Return ``{refseq_id: (cds_start, cds_end)}`` for the given transcripts.

    Scans the gzipped UCSC refGene dump; only rows whose ``name`` column
    (RefSeq ID, column 1) is present in *transcripts* are kept. Columns 6
    and 7 of the table are cdsStart and cdsEnd.
    """
    positions = {}
    with fast_gzip_read('ucsc/ref_gene.tsv.gz') as handle:
        # first line is the UCSC header row — skip it
        next(handle)
        for row in handle:
            fields = row.split('\t')
            refseq_id = fields[1]
            if refseq_id in transcripts:
                cds_start = int(fields[6])
                cds_end = int(fields[7])
                positions[refseq_id] = (cds_start, cds_end)
    return positions
def __init__(self, filename=None):
    """Load a RefSeq -> Ensembl identifier mapping from a gzipped dump.

    Parameters
    ----------
    filename:
        Path to the two-column gzipped mapping file. Falls back to
        ``self.filename`` when not given.

    Raises
    ------
    ValueError
        If neither *filename* nor ``self.filename`` is set.
    """
    data = defaultdict(list)
    if not filename:
        filename = self.filename
    if not filename:
        # was a bare ``raise ValueError`` — include a message so callers
        # get an actionable diagnostic (exception type is unchanged)
        raise ValueError(
            'no filename given and no default filename set on the instance'
        )
    with fast_gzip_read(filename, processes=6) as f:
        header = next(f)
        assert header == '#hg19.knownToRefSeq.value hg19.knownToEnsembl.value\n'
        for line in f:
            fields = line.strip().split('\t')
            try:
                ref_id, mapped_id = fields
            except ValueError:
                # skip rows that do not have exactly two columns
                continue
            # 'n/a' marks RefSeq IDs with no Ensembl counterpart
            if mapped_id != 'n/a':
                data[ref_id].append(mapped_id)
    self.data = data
def _get_all_zscores():
    """Collect every dpsi_zscore from the SPIDEX dump as a list of floats.

    Rows that fail to parse are reported with ``print`` and skipped.
    """
    from multiprocess import fast_gzip_read
    print('Counting...')
    total = count_spidex()
    print('Loading...')
    zscores = []
    with fast_gzip_read(SPIDEX_LOCATION) as f:
        next(f)  # skip the header line
        # NOTE(review): the column index comes from the module-level
        # ``headers`` list, not from the header line just read — confirm
        # the two actually agree.
        extract = itemgetter(headers.index('dpsi_zscore'))
        for line in tqdm(f, total=total - 1):
            try:
                fields = line.rstrip('\n').split('\t')
                zscores.append(float(extract(fields)))
            except Exception as e:
                # best-effort load: report the bad row and keep going
                print(e)
    return zscores
def import_expressed_genes(bdb, tissues=GTEX_TISSUES, path=DEFAULT_PATH, suffix=DEFAULT_GENE_SUFFIX):
    """Populate *bdb* with per-gene metadata from GTEx expression files.

    For every gene_id key the stored record is columns 1-5 of the file:
    gene_name, gene_chr, gene_start, gene_end, strand. A gene seen in
    several tissues must carry identical metadata each time.
    """
    print('Importing expressed genes:')
    total = count_all(tissues, path, suffix)
    with tqdm(total=total) as progress:
        for tissue in tissues:
            name = tissue + suffix
            full_path = os.path.join(path, name)
            print('Loading', name)
            with fast_gzip_read(full_path) as handle:
                next(handle)  # skip header
                for line in handle:
                    fields = line.split()
                    gene_id = fields[0]
                    metadata = fields[1:6]
                    if bdb[gene_id]:
                        # already imported from another tissue — must match
                        assert bdb[gene_id] == metadata
                    else:
                        bdb[gene_id].extend(metadata)
                    progress.update(1)
def iterate_over_expression(tissues_list=GTEX_TISSUES, path=DEFAULT_PATH, suffix=DEFAULT_SUFFIX):
    """Yield ``(variant_id, tissue_name, slope, gene_id)`` tuples.

    Streams each tissue's gzipped GTEx eQTL file in turn; column positions
    are resolved from each file's own header line, so files with different
    column orders are handled correctly.
    """
    for tissue in tissues_list:
        name = tissue + suffix
        full_path = os.path.join(path, name)
        print('Loading', name)
        with fast_gzip_read(full_path, processes='all') as handle:
            # map column name -> position from this file's header
            columns = {
                column: index
                for index, column in enumerate(next(handle).split())
            }
            slope_at = columns['slope']
            gene_at = columns['gene_id']
            variant_at = columns['variant_id']
            for line in handle:
                fields = line.split()
                yield fields[variant_at], tissue, fields[slope_at], fields[gene_at]