Example No. 1
File: test1.py Project: numpde/cbb
    class DATA:
        I = download(
            "ftp://ftp.ensembl.org/pub/release-100/fasta/caenorhabditis_elegans/"
            "dna/Caenorhabditis_elegans.WBcel235.dna.chromosome.I.fa.gz").now

        cDNA = download(
            "ftp://ftp.ensembl.org/pub/release-100/fasta/caenorhabditis_elegans/"
            "cdna/Caenorhabditis_elegans.WBcel235.cdna.all.fa.gz").now
 def test_check_all(self):
     for group in URLS:
         with download(URLS[group]).now.open(mode='rb') as fd:
             with open_maybe_gz(fd) as fd:
                 from idiva.io.ass import check_all
                 for check in check_all(fd):
                     print(group, check)
Example No. 3
def make_df_meta_ucsc():
    with download(URLS['meta']).now.open(mode='r') as fd:
        df = pandas.read_table(fd, index_col=0)
        df = df.assign(celltype=df.Renamed_clusternames)
        p = re.compile(r"_([0-9]+)[.]tab[.]([A-Z][0-9]+)")
        df.index = ["SS2_16_{}_{}".format(*tcga.utils.unlist1(p.findall(i))) for i in df.index]
        return df
Example No. 4
 def test_makes_folder(self):
     with TemporaryDirectory() as tempdir:
         folder = Path(tempdir) / "test"
         with self.assertRaises(ValueError):
             x = download("-").to(abs_path=folder).now
         self.assertTrue(folder.exists())
     self.assertTrue(not folder.exists())
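
For contrast, a minimal sketch of the non-failing path that this test guards (same .to(abs_path=...).now chain, reusing the lambda-phage URL from Example No. 25; the temporary folder is only for illustration):

from pathlib import Path
from tempfile import TemporaryDirectory

from tcga.utils import download

with TemporaryDirectory() as tempdir:
    folder = Path(tempdir) / "test"
    # .to(abs_path=...) creates the target folder on demand, as the test above checks
    data = download("https://www.ebi.ac.uk/ena/browser/api/fasta/J02459.1").to(abs_path=folder).now
    assert folder.exists()
    print(data.text[:42])
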
Example No. 5
class param:
    genera = list(
        lines(
            download(
                "https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/master/"
                "Assignments/NCBIEDirectAssignment/genera.txt").now.text))

    class urls:
        descriptors = "https://api.ncbi.nlm.nih.gov/datasets/v1alpha/assembly_descriptors/organism/"
Example No. 6
def make_df_desc() -> pandas.DataFrame:
    import warnings
    with download(URLS['sc_description']).now.open(mode='rb') as fd:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            values = openpyxl.load_workbook(fd).active.values
        # Note: order of arguments matters
        df = pandas.DataFrame(columns=next(values), data=list(values))
        df = df.rename(columns={'GSM ID': "gsm", 'annoated cell types': "celltype"})
        return df
Example No. 7
 def test_makes_df_ctrl(self):
     # This takes about 20min on v1/v2
     from idiva.clf.df import v0_df
     from idiva.io import ReadVCF
     with download(URLS['ctrl']).now.open(mode='rb') as fd:
         with open_maybe_gz(fd) as fd:
             assert isinstance(fd, io.TextIOBase)
             df = v0_df(ReadVCF(fd))
             self.assertTrue(len(df) > 0)
             self.assertEqual(len(df), ref_len_v2['ctrl'])
Example No. 8
 def test_count(self):
     from idiva.io.vcf import ReadVCF, RawDataline
     # ref_len_v1 = {'ctrl': 2329288, 'case': 2360972}
     ref_len_v2 = {'ctrl': 2227080, 'case': 2258797}
     for group in URLS:
         with download(URLS[group]).now.open(mode='rb') as fd:
             with open_maybe_gz(fd, mode='r') as fd:
                 assert isinstance(fd, io.TextIOBase)
                 nlines = sum(1 for __ in ReadVCF(fd))
                 # print(F"Group {group} has {nlines} datalines")
                 self.assertEqual(nlines, ref_len_v2[group])
Example No. 9
    def test_combines(self):
        from idiva.io import ReadVCF
        from idiva.clf.df import v0_df, join
        dfs = {}

        for k in URLS:
            with download(URLS[k]).now.open(mode='rb') as fd:
                with open_maybe_gz(fd) as fd:
                    assert isinstance(fd, io.TextIOBase)
                    dfs[k] = v0_df(ReadVCF(fd))

        df = join(case=dfs['case'], ctrl=dfs['ctrl'])
Example No. 10
def make_df_expr() -> pandas.DataFrame:
    with download(URLS['GSE98816']).now.open(mode='rb') as fd:
        # samples x genes
        df_expr = pandas.read_table(fd, compression='gzip', quotechar='"', index_col=0).T

    # Sort by sample ID
    df_expr = df_expr.sort_index()
    df_expr.index.name = "sample"

    assert df_expr.index.is_unique

    # Also remove common prefix to match df_meta
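    # For example (made-up IDs): os.path.commonprefix(["AAA_s01", "AAA_s02"]) == "AAA_s0",
    # i.e. the longest shared character prefix; slicing it off leaves the sample-specific suffix.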
    df_expr.index = df_expr.index.str.slice(len(os.path.commonprefix(list(df_expr.index))))
    assert df_expr.index.is_unique
    return df_expr
Example No. 11
    def test_chi2_large(self):
        from idiva.io import ReadVCF, open_maybe_gz
        from idiva.clf.df import v0_df, join

        dfs = {}

        for k in URLS_LARGE:
            with download(URLS_LARGE[k]).now.open(mode='rb') as fd:
                with open_maybe_gz(fd, mode='r') as fd:
                    dfs[k] = v0_df(ReadVCF(fd))

        df = join(case=dfs['case'], ctrl=dfs['ctrl'])

        cols = tuple([F"ALT{n}_{kind}" for n in range(3)]
                     for kind in ['case', 'ctrl'])

        p = chi2_test(df[cols[0] + cols[1]], cols, add=1)
Example No. 12
def make_df_meta() -> pandas.DataFrame:
    import xml.etree.ElementTree as ET
    import tarfile
    import re
    import pandas as pd
    from tcga.utils import unlist1

    with download(URLS['GSE98816_miniml']).now.open(mode='rb') as tf:
        with tarfile.open(fileobj=tf, mode='r') as tar:
            et = ET.parse(source=tar.extractfile(unlist1(tar))).getroot()

            # Namespace a la '{http://www.ncbi.nlm.nih.gov/geo/info/MINiML}'
            ns = unlist1(re.findall(r"({.*}).*", et.tag))

            c1: ET.Element
            # c1 = first(et.findall(ns + "Sample"))
            df_meta = pd.DataFrame(
                data=(
                    {
                        'gsm': c1.attrib['iid'],
                        'sra': unlist1(c1.findall("./*/[@type='SRA']")).attrib["target"].strip(),
                        'taxid': unlist1(c1.findall("*/*/[@taxid]")).attrib["taxid"].strip(),
                        'biosample': unlist1(c1.findall("./*/[@type='BioSample']")).attrib["target"].strip(),
                        'strain': unlist1(c1.findall("*/*/[@tag='strain']")).text.strip().lower(),
                        'tissue': unlist1(c1.findall("*/*/[@tag='tissue']")).text.strip().lower(),
                        'genotype': unlist1(c1.findall("*/*/[@tag='genotype']")).text.strip().lower(),
                        'age': unlist1(c1.findall("*/*/[@tag='age']")).text.strip().lower(),
                        'title': unlist1(c1.findall(ns + "Title")).text.strip(),
                        'accession': unlist1(c1.findall(ns + "Accession")).text.strip(),
                        'description': unlist1(c1.findall(ns + "Description")).text.strip(),
                    }
                    for c1 in et.findall(ns + "Sample")
                )
            )

            # Remove common prefix from the description column
            df_meta = df_meta.assign(
                sample=df_meta.description.str.slice(len(os.path.commonprefix(list(df_meta.description))))
            )

            df_meta = df_meta.drop(columns='description')

        return df_meta
Example No. 13
    )

    df_markers = df_markers.assign(v=1).pivot_table(
        index='gene', columns='celltype', values='v', fill_value=0,
    )

    df_markers = df_markers.astype(int)

    return df_markers


if __name__ == '__main__':
    import json

    from tcga.utils import mkdir

    # Fetch every URL into the cache (the serialized .meta is not used further here)
    for (_, url) in URLS.items():
        json.dumps(download(url).now.meta, indent=2)

    df_meta = make_df_meta()
    df_meta = df_meta.merge(make_df_desc(), how="inner", on="gsm", suffixes=("", " (desc)"))
    df_meta = df_meta.set_index('sample', verify_integrity=True).sort_index()

    df_expr = make_df_expr()
    df_mrkr = make_df_markers()

    assert df_meta.index.equals(df_expr.index)

    df_meta.to_csv(src_dir / "meta.csv.gz", compression='gzip', sep='\t')
    df_expr.to_csv(src_dir / "expr.csv.gz", compression='gzip', sep='\t')
    df_mrkr.to_csv(src_dir / "mrkr.csv", sep='\t')

#
Example No. 14
from pathlib import Path

import pandas as pd

from tcga.utils import download

URLS = {
    'expr':
    "https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_human_ctx_smart-seq/matrix.csv",
    'meta':
    "https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_human_ctx_smart-seq/metadata.csv",
}

out_dir = Path(__file__).with_suffix('')
download = download.to(abs_path=out_dir)

# Fetch each file into the download folder (.meta is just the download record)
for (k, url) in URLS.items():
    (download(url).now.meta)

# with download(URLS['expr']).now.open() as fd:
#     df_expr_index = pd.read_csv(fd, sep=',', usecols=[0], index_col=0).index
#     assert (76533 == len(df_expr_index))
#
# with download(URLS['meta']).now.open() as fd:
#     df_meta_index = pd.read_csv(fd, sep=',', index_col=0).index
#     assert (df_expr_index.equals(df_meta_index[0:len(df_expr_index)]))

with download(URLS['expr']).now.open() as fd:
    df_expr = pd.read_csv(fd, sep=',', nrows=10, index_col=0).astype(int)
    assert (df_expr.shape == (len(df_expr), 50281))

with download(URLS['meta']).now.open() as fd:
    df_meta = pd.read_csv(fd, sep=',', index_col=0)
Example No. 15
# https://www.rdocumentation.org/packages/affy/versions/1.50.0/topics/expresso
# https://gist.github.com/numpde/772cd596fb5fe6036f7e29736bd1cf15

# Note:
# Potentially useful slides
# https://bioinformatics.mdanderson.org/MicroarrayCourse/Lectures/

import re, gzip
import pandas as pd
from tcga.utils import download

# Default download directory
download = download.to(rel_path="UV/download")

url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE60nnn/GSE60880/matrix/GSE60880_series_matrix.txt.gz"
with download(url).now.open(mode='rb') as gz:
    gz.seek(0)
    with gzip.open(gz, mode='r') as fd:
        sample_title = [
            re.findall(r'"([.\w]+)"', line)
            for line in fd.read().decode().splitlines()
            if line.lower().startswith("!sample_title")
        ].pop()

    gz.seek(0)  # !
    df_expr = pd.read_csv(gz, compression="gzip", comment='!', sep='\t', index_col='ID_REF').sort_index()

    assert (len(sample_title) == len(df_expr.columns))
    df_expr.columns = sample_title

# Affymetrix platform info (affyID -> gene names, etc.)
Example No. 16
"""

import numpy as np
import matplotlib.pyplot as plt

from inclusive import range
from plox import Plox
from tcga.utils import download
from pathlib import Path
from itertools import count

download = download.to(rel_path="cache/download")

# Reference [2]
download(
    "https://cpb-us-w2.wpmucdn.com/blog.nus.edu.sg/dist/0/3425/files/2018/10/Understanding-Benjamini-Hochberg-method-2ijolq0.pdf"
).now


def get_obs():
    rs = np.random.RandomState(1)

    # Number of hypothesis tests
    M = 10000

    mus1 = rs.normal(size=M)
    mus2 = mus1 + (np.arange(len(mus1)) > 0.9 * len(mus1))

    # Group sizes
    s1 = 25
    s2 = 25
Example No. 17
from contextlib import ExitStack
from pathlib import Path

from tcga.utils import download

URLS = {
    'ctrl': "https://public.bmi.inf.ethz.ch/eth_intern/teaching/cbm_2020/cbm_2020_project2/control_v2.vcf.gz",
    'case': "https://public.bmi.inf.ethz.ch/eth_intern/teaching/cbm_2020/cbm_2020_project2/case_processed_v2.vcf.gz",
}

CACHE = Path(__file__).parent / "download_cache"
download = download.to(abs_path=CACHE)

HEAD = Path(__file__).parent / "head"
HEAD.mkdir(parents=True, exist_ok=True)

# Number of datalines for the `head` preview of VCF
N = 1000

for url in URLS.values():
    data = download(url).now

for k in URLS:
    data = download(URLS[k]).now
    head = HEAD / Path(data.meta['source']).name

    with ExitStack() as stack:
        src = stack.enter_context(data.open(mode='rb'))

        try:
            import gzip
            # The files above are .vcf.gz, so wrap the source in a gzip reader
            src = stack.enter_context(gzip.open(src))
        except:
            raise
        else:
            # Drop the trailing ".gz" from the preview filename
            head = Path(str(head)[:-3])
Example No. 18
import pandas as pd

from tcga.utils import download
from tcga.strings import lines
from urllib.parse import urlencode, quote

download = download.to(rel_path="cache/download")


class param:
    genera = list(
        lines(
            download(
                "https://raw.githubusercontent.com/linsalrob/ComputationalGenomicsManual/master/"
                "Assignments/NCBIEDirectAssignment/genera.txt").now.text))

    class urls:
        descriptors = "https://api.ncbi.nlm.nih.gov/datasets/v1alpha/assembly_descriptors/organism/"


for genus in param.genera:
    data = download(param.urls.descriptors + quote(genus) + "?" +
                    urlencode({
                        'returned_content': "COMPLETE",
                        'tax_exact_match': False
                    })).now

    if data.json:
        df = pd.DataFrame(data.json['datasets'])
        df = df.sort_values('display_name')
        print(F"{genus}, estimated genome size:",
              list(df.estimated_size.astype(int)))
Example No. 19
File: a_cov.py Project: numpde/cbb
def get_as_df(url, **csv_kwargs) -> pd.DataFrame:
    return pd.read_csv(io.StringIO(download(url).now.text),
                       sep='\t',
                       **csv_kwargs)
Example No. 20
 def test_fail_bad_url(self):
     with TemporaryDirectory() as tempdir:
         with self.assertRaises(ValueError):
             x = download("-").to(abs_path=tempdir).now
         with self.assertRaises(URLError):
             x = download("http://").to(abs_path=tempdir).now
Example No. 21
 def test_rel_or_abs(self):
     with self.assertRaises(RuntimeError):
         download("-").to(rel_path="cache", abs_path="cache")
     with self.assertRaises(TypeError):
         download("-").to("cache")
Example No. 22
 def test_fail_no_to(self):
     with self.assertRaises(RuntimeError):
         x = download("-").now
Example No. 23
 def test_silent_accept_bad_url(self):
     x = download("-")
Example No. 24
def get_pstg_seq() -> str:
    viroid_fasta = download(PARAM['viroid']).to(rel_path="cache/download").now.text
    pstg = SeqIO.read(io.StringIO(viroid_fasta), format='fasta')
    s = First(dna_to_dna).then(dna_to_rna)(pstg.seq)
    return s
Example No. 25
# RA, 2020-06-25

from pathlib import Path
from tempfile import gettempdir
from tcga.utils import download

download = download.to(abs_path=(Path(gettempdir()) / "tcga_download_cache"))
print("Will download to:", download.local_folder)
# Will download to: /tmp/tcga_download_cache

# Lambda phage genome
data = download("https://www.ebi.ac.uk/ena/browser/api/fasta/J02459.1").again(
    False).now

print(data.meta)  # same as tcga.refs.annotations[data]
# {'source': 'https://www.ebi.ac.uk/ena/browser/api/fasta/J02459.1', 'datetime': '2020-06-25 07:18:52.065826+00:00'}

print(data.text[0:42], "...", data.text[330:350], "...")
# >ENA|J02459|J02459.1 Escherichia phage Lam ... CAGGGAATGCCCGTTCTGCG ...

print(data.local_file)
# /tmp/tcga_download_cache/Z9tBKiJCqrfWuYy5BlgrA3zZAWav2CUd4xrPsya93Os=.zip

try:
    from Bio import SeqIO
except ImportError:
    print("Need `biopython`")
else:
    with data.open(mode='r') as fd:
        print(SeqIO.read(fd, format='fasta'))
# ID: ENA|J02459|J02459.1
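
The .local_file above points at the cache entry itself (note the .zip extension and the opaque file name). Example No. 29 below opens such a cache file directly with zipfile and reads a member named "data"; assuming the same layout here, a sketch:

from zipfile import ZipFile

with ZipFile(data.local_file, mode='r') as zf:
    raw = zf.read("data")      # the downloaded bytes, as stored inside the cache wrapper
    print(raw[:42].decode())
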
Example No. 26
 def url2df(k):
     with download(URLS[k]).now.open(mode='rb') as fd:
         df = pandas.read_table(fd, compression='gzip', index_col=0).astype(int).sort_index(axis=1)
         df.columns = [f"{k}_{c}" for c in df.columns]
         return df
Example No. 27
# RA, 2020-06-27

from tcga.utils import download

url = "https://www.gsea-msigdb.org/gsea/msigdb/download_file.jsp?filePath=/msigdb/release/7.1/msigdb_v7.1_files_to_download_locally.zip"
download(url).to(rel_path="original").now
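
The downloaded file is itself a zip archive; since the handle returned by .open(mode='rb') is seekable (Example No. 15 seeks on it), it can presumably be handed straight to zipfile -- a sketch, with namelist() only for illustration:

import zipfile

from tcga.utils import download

url = "https://www.gsea-msigdb.org/gsea/msigdb/download_file.jsp?filePath=/msigdb/release/7.1/msigdb_v7.1_files_to_download_locally.zip"
with download(url).to(rel_path="original").now.open(mode='rb') as fd:
    with zipfile.ZipFile(fd) as zf:
        print(zf.namelist())   # contents of the msigdb bundle
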
Example No. 28
def make_df_expr_ucsc():
    import gzip
    with download(URLS['expr']).now.open(mode='rb') as fd:
        with gzip.open(fd, mode='rb') as gz:
            return pandas.read_table(gz, index_col=0)
Example No. 29
File: a_cov.py Project: numpde/cbb
def load_cov2() -> pd.DataFrame:
    import zipfile
    with zipfile.ZipFile(download(url=PARAM['GSE CoV2']).now.local_file,
                         mode='r') as zf:
        with zf.open("data") as fd:
            return pd.read_csv(fd, compression="gzip", sep='\t', index_col=0)
Example No. 30
    def maker_clinvar_clf() -> pandas.DataFrame:
        from idiva.db.clinvar import df_clinvar_to_clf_data
        # If you change this function, change the cache key also.
        # Preparing the clinvar dataframe for categorical classification:
        df_clinvar_reduced = df_clinvar[df_clinvar['CLNSIG'].isin({'Pathogenic', 'Benign'})]
        return df_clinvar_to_clf_data(df_clinvar_reduced, base_string_encoding=base_string_encoding)

    return cache_df(name="clinvar_clf_data", key=[base_string_encoding, "v01"], df_maker=maker_clinvar_clf)


if __name__ == '__main__':
    URLS = {
        'ctrl': "https://public.bmi.inf.ethz.ch/eth_intern/teaching/cbm_2020/cbm_2020_project2/control_v2.vcf.gz",
        'case': "https://public.bmi.inf.ethz.ch/eth_intern/teaching/cbm_2020/cbm_2020_project2/case_processed_v2.vcf.gz",
    }

    from tcga.utils import download
    from pathlib import Path
    from contexttimer import Timer

    cache = (Path(__file__).parent.parent.parent.parent / "input/download_cache").resolve()
    assert cache.is_dir()
    download = download.to(abs_path=cache)

    with download(URLS['ctrl']).now.open() as fd:
        with Timer() as timer:
            df = v0_df(idiva.io.ReadVCF(fd))

        print(F"This took {timer.elapsed} seconds")