Exemplo n.º 1
0
def test_here(tmp_path):
    """Assert that ``here`` finds a data file while the cwd is a sibling dir.

    Builds ``proj/src`` and ``proj/data`` under ``tmp_path``, drops a
    ``.here`` marker file into ``proj``, changes the working directory to
    ``proj/src`` and checks that ``here("data", "data.csv")`` resolves to
    ``proj/data/data.csv``.

    NOTE: the working directory is changed and never restored here.
    """
    project_root = tmp_path / "proj"
    src_dir = project_root / "src"
    data_dir = project_root / "data"

    # Lay out the dummy project: two sibling directories under the root.
    for directory in (src_dir, data_dir):
        directory.mkdir(parents=True, exist_ok=True)

    # The .here marker file sits at the project root.
    (project_root / ".here").touch()

    # Empty placeholder standing in for a data file.
    expected = data_dir / "data.csv"
    expected.touch()

    # Work from the src directory; on Python 2 chdir is given a plain string.
    chdir(str(src_dir) if sys.version_info[0] == 2 else src_dir)

    herepath = here("data", "data.csv")

    assert herepath.resolve() == expected.resolve()
Exemplo n.º 2
0
import pandas as pd
import requests
import urllib.request
from pyhere import here

# Load the ID table: the first line of the file is skipped and, with
# header=None, the columns are addressed by position rather than by name.
dataset = pd.read_csv(here('data', 'Uniprot', 'ZambeziensisUniprot.csv'), skiprows=[0], header=None, index_col=False)
print(len(dataset))

# Collects the row counter of every entry whose downloaded sequence is empty.
obsolete = []
# Manual position counter; the loop below uses it to index dataset.iloc.
row = 0

for i in dataset.itertuples():
    ## retrieve the sequences using Uniprot IDs from the Uniprot website
    uniprotID = dataset.iloc(axis=0)[row, 0]
    url = 'https://www.uniprot.org/uniprot/'+ uniprotID +'.fasta'
    row = row + 1\
    
    try:
        with urllib.request.urlopen(url) as url:

            ## read the content from the url and decode it
            page = url.read()
            seq = page.decode('utf8')

            ## open and write a new fasta file with all the sequences corresponding to the Uniprot IDs stored in the csv file
            with open(here('data', 'fasta', 'Rhipicephalus_zambeziensis.fasta'), 'a') as ffasta:
                ffasta.write(seq)

            if len(seq) == 0:
                obsolete.append(row)
    except:
 def dir_path(*args) -> PosixPath:
     """Call ``here`` with ``dirname`` followed by ``*args``.

     ``dirname`` is not a parameter; it is read from the surrounding scope.
     A string counts as one leading component, any other value is unpacked
     into several leading components.
     """
     # Normalise to a tuple of leading components so a single call suffices.
     leading = (dirname,) if isinstance(dirname, str) else dirname
     return here(*leading, *args)
Exemplo n.º 4
0
    # # columns = ['tum_lymfklieren_positief_atl']
    # X = X.loc[:, columns]
    # print(X.head(20))
    #
    # gen_cont = GeneralizeContinuous(n_bins=10, strategy='quantile', labeled_missing=[999])
    # # X = X.dropna()
    #
    # gen_cont.fit(X)
    # X_cat = gen_cont.transform(X)
    # print(X_cat)
    #
    # X_inv = gen_cont.inverse_transform(X_cat)
    # print(X_inv)
    # print(gen_cont.bin_edges_)

    # Read the example CSV and cast every column to str.
    # NOTE(review): the delimiter is two characters (comma + space); pandas
    # presumably treats multi-character separators as a regex with the
    # Python engine -- confirm against the read_csv documentation.
    data_path = here("examples/data/input/adult_9c.csv")
    df = pd.read_csv(data_path, delimiter=', ').astype(str)
    print(df.head())
    # Only these two columns are passed to the generalizer below.
    df_s = df[['native-country', 'occupation']]

    # epsilon = float(np.inf)
    epsilon = 0.1
    gen_cat = GeneralizeCategorical(epsilon=epsilon, n_bins=5)
    gen_cat.fit(df_s)
    # The transform output is re-wrapped in a DataFrame that reuses the
    # column labels of the input selection.
    df_sT = gen_cat.transform(df_s)
    df_sT = pd.DataFrame(df_sT, columns=df_s.columns)

    # Send the transformed frame back through inverse_transform, again
    # wrapped with the input's column labels.
    df_sI = pd.DataFrame(gen_cat.inverse_transform(df_sT),
                         columns=df_s.columns)

    # Hand the original selection and the round-tripped frame to the
    # value-count comparison helper.
    visual.compare_value_counts(df_s, df_sI)
Exemplo n.º 5
0
import pandas as pd
import requests
import urllib.request
from pyhere import here

from Bio import Entrez

# Load the ID table: the first line of the file is skipped and, with
# header=None, the columns are addressed by position rather than by name.
dataset = pd.read_csv(here('data', 'Uniprot', 'ScapularisNCBI.csv'),
                      skiprows=[0],
                      header=None,
                      index_col=False)
# FASTA output path; the loop below opens it in append mode.
outFile = here('data', 'fasta', 'Ixodes_scapularis.fasta')
print(len(dataset))

# NOTE(review): presumably collects entries with no sequence; the code that
# fills this list is not visible in this excerpt -- confirm.
obsolete = []
# Manual position counter; the loop below uses it to index dataset.iloc.
row = 0

for i in dataset.itertuples():
    ## retrieve the sequences using Uniprot IDs from the Uniprot website
    uniprotID = dataset.iloc(axis=0)[row, 0]
    row = row + 1\

    try:
        with Entrez.efetch(db='protein', id=uniprotID,
                           rettype='fasta') as handle:

            ## read the content from the Entrez database
            seq = handle.read()

            ## open and write a new fasta file with all the sequences corresponding to the Uniprot IDs stored in the csv file
            with open(outFile, 'a') as ffasta:
Exemplo n.º 6
0
 def dir_path(*args) -> PosixPath:
     """Call ``here`` with ``dirname`` followed by ``*args``.

     ``dirname`` is not a parameter; it is read from the surrounding scope.
     A string is passed as a single leading component; any other value is
     unpacked into separate leading components.
     """
     # isinstance (rather than an exact type comparison) also treats str
     # subclasses as one component instead of unpacking them per character.
     if isinstance(dirname, str):
         return here(dirname, *args)
     return here(*dirname, *args)
import pandas as pd
import requests
import urllib.request
from pyhere import here

from Bio import Entrez

# Load the ID table: the first line of the file is skipped and, with
# header=None, the columns are addressed by position rather than by name.
dataset = pd.read_csv(here('data', 'Uniprot', 'moubataUniprot.csv'),
                      skiprows=[0],
                      header=None,
                      index_col=False)
# NOTE(review): presumably the FASTA file the sequences are written to; the
# code that uses it is not visible in this excerpt -- confirm.
outFile = here('data', 'fasta', 'Ornithodoros_moubata2.fasta')
print(len(dataset))

# NOTE(review): presumably collects entries with no sequence; the code that
# fills this list is not visible in this excerpt -- confirm.
obsolete = []
# Manual position counter; the loop below uses it to index dataset.iloc.
row = 0

for i in dataset.itertuples():
    ## retrieve the sequences using Uniprot IDs from the Uniprot website
    uniprotID = dataset.iloc(axis=0)[row, 0]
    url = 'https://www.uniprot.org/uniprot/' + uniprotID + '.fasta'
    row = row + 1\

    try:
        with urllib.request.urlopen(url) as url:

            ## read the content from the url and decode it
            page = url.read()
            seq = page.decode('utf8')

            ## open and write a new fasta file with all the sequences corresponding to the Uniprot IDs stored in the csv file