Exemplo n.º 1
Arquivo: test1.py Projeto: numpde/cbb
    class DATA:
        I = download(

        cDNA = download(
 def test_check_all(self):
     """Print the outcome of every `check_all` check for each dataset listed in URLS."""
     from idiva.io.ass import check_all

     for (group, url) in URLS.items():
         with download(url).now.open(mode='rb') as raw:
             # `open_maybe_gz` wraps the binary handle of the downloaded file
             with open_maybe_gz(raw) as fd:
                 for check in check_all(fd):
                     print(group, check)
Exemplo n.º 3
def make_df_meta_ucsc():
    """
    Load the 'meta' table as a dataframe indexed by its first column.

    A `celltype` column is added as a copy of `Renamed_clusternames`, and
    every index label is rewritten as "SS2_16_<a>_<b>", where <a> and <b>
    are the two groups captured from the original label.
    """
    pattern = re.compile(r"_([0-9]+)[.]tab[.]([A-Z][0-9]+)")

    def relabel(label):
        # `unlist1` presumably unwraps a one-element list -- confirm in tcga.utils
        match = tcga.utils.unlist1(pattern.findall(label))
        return "SS2_16_{}_{}".format(*match)

    with download(URLS['meta']).now.open(mode='r') as fd:
        table = pandas.read_table(fd, index_col=0)

    table = table.assign(celltype=table.Renamed_clusternames)
    table.index = [relabel(label) for label in table.index]
    return table
Exemplo n.º 4
 def test_makes_folder(self):
     """
     Resolving the bad URL "-" into a subfolder of a temporary directory
     raises ValueError; the subfolder does not exist after the temporary
     directory has been cleaned up.
     """
     with TemporaryDirectory() as tempdir:
         folder = Path(tempdir, "test")
         with self.assertRaises(ValueError):
             _ = download("-").to(abs_path=folder).now
     self.assertFalse(folder.exists())
Exemplo n.º 5
class param:
    genera = list(

    class urls:
        descriptors = "https://api.ncbi.nlm.nih.gov/datasets/v1alpha/assembly_descriptors/organism/"
Exemplo n.º 6
def make_df_desc() -> pandas.DataFrame:
    """
    Read the 'sc_description' workbook into a dataframe.

    The first row of the active worksheet provides the column names and the
    remaining rows provide the data. The columns 'GSM ID' and
    'annoated cell types' (spelled as in the workbook) are renamed to
    "gsm" and "celltype".

    Returns:
        The worksheet contents as a dataframe with the renamed columns.
    """
    import warnings

    with download(URLS['sc_description']).now.open(mode='rb') as fd:
        with warnings.catch_warnings():
            # `catch_warnings` on its own only saves and restores the filter
            # state; a filter has to be installed for warnings raised while
            # loading the workbook to actually be silenced.
            warnings.simplefilter("ignore")
            rows = openpyxl.load_workbook(fd).active.values

        # Note: the header row must be consumed before the remaining rows
        header = next(rows)
        df = pandas.DataFrame(columns=header, data=list(rows))

    return df.rename(columns={'GSM ID': "gsm", 'annoated cell types': "celltype"})
 def test_makes_df_ctrl(self):
     """Build the v0 dataframe from the 'ctrl' VCF and check its number of rows."""
     # This takes about 20min on v1/v2
     from idiva.clf.df import v0_df
     from idiva.io import ReadVCF

     with download(URLS['ctrl']).now.open(mode='rb') as raw:
         with open_maybe_gz(raw) as fd:
             assert isinstance(fd, io.TextIOBase)
             n_rows = len(v0_df(ReadVCF(fd)))

     self.assertGreater(n_rows, 0)
     # NOTE(review): `ref_len_v2` is not defined in this method;
     # it has to be provided by an enclosing scope -- confirm.
     self.assertEqual(n_rows, ref_len_v2['ctrl'])
Exemplo n.º 8
 def test_count(self):
     """Count the datalines of every VCF in URLS and compare with the v2 reference counts."""
     from idiva.io.vcf import ReadVCF, RawDataline

     # The v1 files had: {'ctrl': 2329288, 'case': 2360972}
     ref_len_v2 = dict(ctrl=2227080, case=2258797)

     for (group, url) in URLS.items():
         with download(url).now.open(mode='rb') as raw:
             with open_maybe_gz(raw, mode='r') as fd:
                 assert isinstance(fd, io.TextIOBase)

                 nlines = 0
                 for __ in ReadVCF(fd):
                     nlines += 1

                 self.assertEqual(nlines, ref_len_v2[group])
    def test_combines(self):
        """Build one v0 dataframe per entry of URLS, then join the 'case' and 'ctrl' ones."""
        from idiva.io import ReadVCF
        from idiva.clf.df import v0_df, join

        def load(url):
            # Download, open (decompressing if needed) and parse one VCF
            with download(url).now.open(mode='rb') as raw:
                with open_maybe_gz(raw) as fd:
                    assert isinstance(fd, io.TextIOBase)
                    return v0_df(ReadVCF(fd))

        dfs = {}
        for (k, url) in URLS.items():
            dfs[k] = load(url)

        df = join(case=dfs['case'], ctrl=dfs['ctrl'])
Exemplo n.º 10
def make_df_expr() -> pandas.DataFrame:
    """
    Load the 'GSE98816' expression table as a samples x genes dataframe.

    The rows are sorted by sample ID, the index is named "sample", and the
    prefix common to all sample IDs is stripped from the index.
    The index is asserted to be unique both before and after the stripping.
    """
    with download(URLS['GSE98816']).now.open(mode='rb') as fd:
        raw_table = pandas.read_table(fd, compression='gzip', quotechar='"', index_col=0)

    # Transpose to samples x genes and order by sample ID
    df_expr = raw_table.T.sort_index()
    df_expr.index.name = "sample"
    assert df_expr.index.is_unique

    # Drop the prefix shared by all sample IDs to match df_meta
    shared_prefix = os.path.commonprefix(list(df_expr.index))
    df_expr.index = df_expr.index.str.slice(len(shared_prefix))
    assert df_expr.index.is_unique

    return df_expr
    def test_chi2_large(self):
        """Join the 'case' and 'ctrl' dataframes of URLS_LARGE and run `chi2_test` on the ALT columns."""
        from idiva.io import ReadVCF, open_maybe_gz
        from idiva.clf.df import v0_df, join

        def load(url):
            # Download, open (decompressing if needed) and parse one VCF
            with download(url).now.open(mode='rb') as raw:
                with open_maybe_gz(raw, mode='r') as fd:
                    return v0_df(ReadVCF(fd))

        dfs = {}
        for (k, url) in URLS_LARGE.items():
            dfs[k] = load(url)

        df = join(case=dfs['case'], ctrl=dfs['ctrl'])

        # One list of column names per group: first 'case', then 'ctrl'
        groups = []
        for kind in ['case', 'ctrl']:
            groups.append([F"ALT{n}_{kind}" for n in range(3)])
        cols = tuple(groups)

        (case_cols, ctrl_cols) = cols
        p = chi2_test(df[case_cols + ctrl_cols], cols, add=1)
Exemplo n.º 12
def make_df_meta() -> pandas.DataFrame:
    import xml.etree.ElementTree as ET
    import tarfile
    import re
    import pandas as pd
    from tcga.utils import unlist1

    with download(URLS['GSE98816_miniml']).now.open(mode='rb') as tf:
        with tarfile.open(fileobj=tf, mode='r') as tar:
            et = ET.parse(source=tar.extractfile(unlist1(tar))).getroot()

            # Namespace a la '{http://www.ncbi.nlm.nih.gov/geo/info/MINiML}'
            ns = unlist1(re.findall(r"({.*}).*", et.tag))

            c1: ET.Element
            # c1 = first(et.findall(ns + "Sample"))
            df_meta = pd.DataFrame(
                        'gsm': c1.attrib['iid'],
                        'sra': unlist1(c1.findall("./*/[@type='SRA']")).attrib["target"].strip(),
                        'taxid': unlist1(c1.findall("*/*/[@taxid]")).attrib["taxid"].strip(),
                        'biosample': unlist1(c1.findall("./*/[@type='BioSample']")).attrib["target"].strip(),
                        'strain': unlist1(c1.findall("*/*/[@tag='strain']")).text.strip().lower(),
                        'tissue': unlist1(c1.findall("*/*/[@tag='tissue']")).text.strip().lower(),
                        'genotype': unlist1(c1.findall("*/*/[@tag='genotype']")).text.strip().lower(),
                        'age': unlist1(c1.findall("*/*/[@tag='age']")).text.strip().lower(),
                        'title': unlist1(c1.findall(ns + "Title")).text.strip(),
                        'accession': unlist1(c1.findall(ns + "Accession")).text.strip(),
                        'description': unlist1(c1.findall(ns + "Description")).text.strip(),
                    for c1 in et.findall(ns + "Sample")

            # Remove common prefix from the description column
            df_meta = df_meta.assign(

            df_meta = df_meta.drop(columns='description')

        return df_meta
Exemplo n.º 13

    df_markers = df_markers.assign(v=1).pivot_table(
        index='gene', columns='celltype', values='v', fill_value=0,

    df_markers = df_markers.astype(int)

    return df_markers

if __name__ == '__main__':
    from tcga.utils import mkdir

    # Resolve every source and serialize its metadata.
    # NOTE(review): the serialized string is discarded -- was it meant to be printed?
    for url in URLS.values():
        json.dumps(download(url).now.meta, indent=2)

    # Sample metadata: inner-join with the description on "gsm", then index by sample
    df_meta = (
        make_df_meta()
        .merge(make_df_desc(), how="inner", on="gsm", suffixes=("", " (desc)"))
        .set_index('sample', verify_integrity=True)
        .sort_index()
    )

    df_expr = make_df_expr()
    df_mrkr = make_df_markers()

    # Metadata and expression must list the same samples in the same order
    assert df_meta.index.equals(df_expr.index)

    # All outputs are tab-separated; the first two are gzip-compressed
    df_meta.to_csv(src_dir / "meta.csv.gz", compression='gzip', sep='\t')
    df_expr.to_csv(src_dir / "expr.csv.gz", compression='gzip', sep='\t')
    df_mrkr.to_csv(src_dir / "mrkr.csv", sep='\t')

Exemplo n.º 14
import pandas as pd

from tcga.utils import download

URLS = {

out_dir = Path(__file__).with_suffix('')
download = download.to(abs_path=out_dir)

for (k, url) in URLS.items():

# with download(URLS['expr']).now.open() as fd:
#     df_expr_index = pd.read_csv(fd, sep=',', usecols=[0], index_col=0).index
#     assert (76533 == len(df_expr_index))
# with download(URLS['meta']).now.open() as fd:
#     df_meta_index = pd.read_csv(fd, sep=',', index_col=0).index
#     assert (df_expr_index.equals(df_meta_index[0:len(df_expr_index)]))

with download(URLS['expr']).now.open() as fd:
    df_expr = pd.read_csv(fd, sep=',', nrows=10, index_col=0).astype(int)
    assert (df_expr.shape == (len(df_expr), 50281))

with download(URLS['meta']).now.open() as fd:
    df_meta = pd.read_csv(fd, sep=',', index_col=0)
Exemplo n.º 15
# https://www.rdocumentation.org/packages/affy/versions/1.50.0/topics/expresso
# https://gist.github.com/numpde/772cd596fb5fe6036f7e29736bd1cf15

# Note:
# Potentially useful slides
# https://bioinformatics.mdanderson.org/MicroarrayCourse/Lectures/

import re, gzip
import pandas as pd
from tcga.utils import download

# Default download directory
download = download.to(rel_path="UV/download")

url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE60nnn/GSE60880/matrix/GSE60880_series_matrix.txt.gz"
with download(url).now.open(mode='rb') as gz:
    with gzip.open(gz, mode='r') as fd:
        sample_title = [
            re.findall(r'"([.\w]+)"', line)
            for line in fd.read().decode().splitlines()
            if line.lower().startswith("!sample_title")

    gz.seek(0)  # !
    df_expr = pd.read_csv(gz, compression="gzip", comment='!', sep='\t', index_col='ID_REF').sort_index()

    assert (len(sample_title) == len(df_expr.columns))
    df_expr.columns = sample_title

# Affymetrix platform info (affyID -> gene names, etc.)
Exemplo n.º 16

import numpy as np
import matplotlib.pyplot as plt

from inclusive import range
from plox import Plox
from tcga.utils import download
from pathlib import Path
from itertools import count

download = download.to(rel_path="cache/download")

# Reference [2]

def get_obs():
    rs = np.random.RandomState(1)

    # Number of hypothesis tests
    M = 10000

    mus1 = rs.normal(size=M)
    mus2 = mus1 + (np.arange(len(mus1)) > 0.9 * len(mus1))

    # Group sizes
    s1 = 25
    s2 = 25
Exemplo n.º 17
URLS = {
    'ctrl': "https://public.bmi.inf.ethz.ch/eth_intern/teaching/cbm_2020/cbm_2020_project2/control_v2.vcf.gz",
    'case': "https://public.bmi.inf.ethz.ch/eth_intern/teaching/cbm_2020/cbm_2020_project2/case_processed_v2.vcf.gz",

CACHE = Path(__file__).parent / "download_cache"
download = download.to(abs_path=CACHE)

HEAD = Path(__file__).parent / "head"
HEAD.mkdir(parents=True, exist_ok=True)

# Number of datalines for the `head` preview of VCF
N = 1000

for url in URLS.values():
    data = download(url).now

for k in URLS:
    data = download(URLS[k]).now
    head = HEAD / Path(data.meta['source']).name

    with ExitStack() as stack:
        src = stack.enter_context(data.open(mode='rb'))

            import gzip
            src = stack.enter_context(gzip.open(src))
            head = Path(str(head)[:-3])
Exemplo n.º 18
from tcga.utils import download
from tcga.strings import lines
from urllib.parse import urlencode, quote

download = download.to(rel_path="cache/download")

class param:
    genera = list(

    class urls:
        descriptors = "https://api.ncbi.nlm.nih.gov/datasets/v1alpha/assembly_descriptors/organism/"

for genus in param.genera:
    data = download(param.urls.descriptors + quote(genus) + "?" +
                        'returned_content': "COMPLETE",
                        'tax_exact_match': False

    if data.json:
        df = pd.DataFrame(data.json['datasets'])
        df = df.sort_values('display_name')
        print(F"{genus}, estimated genome size:",
Exemplo n.º 19
Arquivo: a_cov.py Projeto: numpde/cbb
def get_as_df(url, **csv_kwargs) -> pd.DataFrame:
    return pd.read_csv(io.StringIO(download(url).now.text),
Exemplo n.º 20
 def test_fail_bad_url(self):
     """Resolving "-" raises ValueError and resolving "http://" raises URLError."""
     expectations = [
         ("-", ValueError),
         ("http://", URLError),
     ]
     with TemporaryDirectory() as tempdir:
         for (bad_url, expected_error) in expectations:
             with self.assertRaises(expected_error):
                 _ = download(bad_url).to(abs_path=tempdir).now
Exemplo n.º 21
 def test_rel_or_abs(self):
     with self.assertRaises(RuntimeError):
         download("-").to(rel_path="cache", abs_path="cache")
     with self.assertRaises(TypeError):
Exemplo n.º 22
 def test_fail_no_to(self):
     """Resolving a download without a `.to(...)` target raises RuntimeError."""
     self.assertRaises(RuntimeError, lambda: download("-").now)
Exemplo n.º 23
 def test_silent_accept_bad_url(self):
     """Merely constructing a download for the bad URL "-" must not raise."""
     _ = download("-")
Exemplo n.º 24
def get_pstg_seq() -> str:
    """
    Download the FASTA record at PARAM['viroid'] (cached under
    "cache/download") and return its sequence after passing it through
    `dna_to_dna` and then `dna_to_rna`.
    """
    data = download(PARAM['viroid']).to(rel_path="cache/download").now
    record = SeqIO.read(io.StringIO(data.text), format='fasta')

    # Compose the two conversions, then apply them to the record's sequence
    convert = First(dna_to_dna).then(dna_to_rna)
    return convert(record.seq)
Exemplo n.º 25
# RA, 2020-06-25

from pathlib import Path
from tempfile import gettempdir
from tcga.utils import download

download = download.to(abs_path=(Path(gettempdir()) / "tcga_download_cache"))
print("Will download to:", download.local_folder)
# Will download to: /tmp/tcga_download_cache

# Lambda phage genome
data = download("https://www.ebi.ac.uk/ena/browser/api/fasta/J02459.1").again(

print(data.meta)  # same as tcga.refs.annotations[data]
# {'source': 'https://www.ebi.ac.uk/ena/browser/api/fasta/J02459.1', 'datetime': '2020-06-25 07:18:52.065826+00:00'}

print(data.text[0:42], "...", data.text[330:350], "...")
# >ENA|J02459|J02459.1 Escherichia phage Lam ... CAGGGAATGCCCGTTCTGCG ...

# /tmp/tcga_download_cache/Z9tBKiJCqrfWuYy5BlgrA3zZAWav2CUd4xrPsya93Os=.zip

    from Bio import SeqIO
except ImportError:
    print("Need `biopython`")
    with data.open(mode='r') as fd:
        print(SeqIO.read(fd, format='fasta'))
# ID: ENA|J02459|J02459.1
Exemplo n.º 26
 def url2df(k):
     """
     Load the gzip-compressed table at URLS[k] as an integer dataframe with
     columns sorted by label, then prefix every column label with "<k>_".
     """
     with download(URLS[k]).now.open(mode='rb') as fd:
         table = pandas.read_table(fd, compression='gzip', index_col=0)

     table = table.astype(int).sort_index(axis=1)
     table.columns = [f"{k}_{c}" for c in table.columns]
     return table
Exemplo n.º 27
# RA, 2020-06-27

from tcga.utils import download

url = "https://www.gsea-msigdb.org/gsea/msigdb/download_file.jsp?filePath=/msigdb/release/7.1/msigdb_v7.1_files_to_download_locally.zip"
Exemplo n.º 28
def make_df_expr_ucsc():
    """Read the gzip-compressed 'expr' table into a dataframe indexed by its first column."""
    import gzip

    with download(URLS['expr']).now.open(mode='rb') as fd, gzip.open(fd, mode='rb') as gz:
        return pandas.read_table(gz, index_col=0)
Exemplo n.º 29
Arquivo: a_cov.py Projeto: numpde/cbb
def load_cov2() -> pd.DataFrame:
    """
    Read the member "data" of the downloaded PARAM['GSE CoV2'] zip archive
    as a gzip-compressed, tab-separated table indexed by its first column.
    """
    import zipfile

    archive = download(url=PARAM['GSE CoV2']).now.local_file
    with zipfile.ZipFile(archive, mode='r') as zf, zf.open("data") as fd:
        return pd.read_csv(fd, compression="gzip", sep='\t', index_col=0)
Exemplo n.º 30
    def maker_clinvar_clf() -> pandas.DataFrame:
        """
        Prepare the clinvar dataframe for categorical classification.

        Keeps only the rows of `df_clinvar` whose 'CLNSIG' is 'Pathogenic'
        or 'Benign' and passes them to `df_clinvar_to_clf_data`.
        """
        from idiva.db.clinvar import df_clinvar_to_clf_data

        # If you change this function, change the cache key also.
        is_labelled = df_clinvar['CLNSIG'].isin({'Pathogenic', 'Benign'})
        return df_clinvar_to_clf_data(df_clinvar[is_labelled], base_string_encoding=base_string_encoding)

    return cache_df(name="clinvar_clf_data", key=[base_string_encoding, "v01"], df_maker=maker_clinvar_clf)

if __name__ == '__main__':
    URLS = {
        'ctrl': "https://public.bmi.inf.ethz.ch/eth_intern/teaching/cbm_2020/cbm_2020_project2/control_v2.vcf.gz",
        'case': "https://public.bmi.inf.ethz.ch/eth_intern/teaching/cbm_2020/cbm_2020_project2/case_processed_v2.vcf.gz",

    from tcga.utils import download
    from pathlib import Path
    from contexttimer import Timer

    cache = (Path(__file__).parent.parent.parent.parent / "input/download_cache").resolve()
    assert cache.is_dir()
    download = download.to(abs_path=cache)

    with download(URLS['ctrl']).now.open() as fd:
        with Timer() as timer:
            df = v0_df(idiva.io.ReadVCF(fd))

        print(F"This took {timer.elapsed} seconds")