def test_here(tmp_path):
    """here() must resolve paths against the .here project root, not the cwd."""
    project_root = tmp_path / "proj"
    src_dir = project_root / "src"
    data_dir = project_root / "data"
    src_dir.mkdir(parents=True, exist_ok=True)
    data_dir.mkdir(parents=True, exist_ok=True)

    # Mark the project root with a .here sentinel file.
    (project_root / ".here").touch()

    # A fake data file for here() to locate.
    data_file = data_dir / "data.csv"
    data_file.touch()

    # Work from src/ so resolution cannot succeed by accident; Python 2's
    # chdir does not accept path objects, so fall back to a plain str there.
    chdir(str(src_dir) if sys.version_info[0] == 2 else src_dir)

    herepath = here("data", "data.csv")
    assert herepath.resolve() == data_file.resolve()
import pandas as pd import requests import urllib.request from pyhere import here dataset = pd.read_csv(here('data', 'Uniprot', 'ZambeziensisUniprot.csv'), skiprows=[0], header=None, index_col=False) print(len(dataset)) obsolete = [] row = 0 for i in dataset.itertuples(): ## retrieve the sequences using Uniprot IDs from the Uniprot website uniprotID = dataset.iloc(axis=0)[row, 0] url = 'https://www.uniprot.org/uniprot/'+ uniprotID +'.fasta' row = row + 1\ try: with urllib.request.urlopen(url) as url: ## read the content from the url and decode it page = url.read() seq = page.decode('utf8') ## open and write a new fasta file with all the sequences corresponding to the Uniprot IDs stored in the csv file with open(here('data', 'fasta', 'Rhipicephalus_zambeziensis.fasta'), 'a') as ffasta: ffasta.write(seq) if len(seq) == 0: obsolete.append(row) except:
def dir_path(*args) -> PosixPath:
    """Build a project-root-relative path via ``here()``.

    Prepends the module-level ``dirname`` — either a single directory name
    (str) or an iterable of names — to *args.
    """
    # A plain string is one path component; anything else is unpacked.
    parts = (dirname,) if isinstance(dirname, str) else tuple(dirname)
    return here(*parts, *args)
# NOTE(review): a commented-out GeneralizeContinuous experiment that used to
# live here was removed as dead code; recover it from version control if needed.

# Load the adult dataset; everything is treated as string/categorical input.
data_path = here("examples/data/input/adult_9c.csv")
# ', ' is a multi-character separator: pandas can only handle it with the
# python engine, so request it explicitly instead of relying on the
# ParserWarning-emitting fallback.
df = pd.read_csv(data_path, sep=', ', engine='python').astype(str)
print(df.head())

# Restrict the demo to two categorical columns.
df_s = df[['native-country', 'occupation']]

# Privacy budget for the categorical generalizer.
# epsilon = float(np.inf)
epsilon = 0.1
gen_cat = GeneralizeCategorical(epsilon=epsilon, n_bins=5)
gen_cat.fit(df_s)

# Round-trip: transform to the generalized representation and invert it back,
# then compare value counts to eyeball the information loss.
df_sT = gen_cat.transform(df_s)
df_sT = pd.DataFrame(df_sT, columns=df_s.columns)
df_sI = pd.DataFrame(gen_cat.inverse_transform(df_sT), columns=df_s.columns)
visual.compare_value_counts(df_s, df_sI)
import pandas as pd import requests import urllib.request from pyhere import here from Bio import Entrez dataset = pd.read_csv(here('data', 'Uniprot', 'ScapularisNCBI.csv'), skiprows=[0], header=None, index_col=False) outFile = here('data', 'fasta', 'Ixodes_scapularis.fasta') print(len(dataset)) obsolete = [] row = 0 for i in dataset.itertuples(): ## retrieve the sequences using Uniprot IDs from the Uniprot website uniprotID = dataset.iloc(axis=0)[row, 0] row = row + 1\ try: with Entrez.efetch(db='protein', id=uniprotID, rettype='fasta') as handle: ## read the content from the Entrez database seq = handle.read() ## open and write a new fasta file with all the sequences corresponding to the Uniprot IDs stored in the csv file with open(outFile, 'a') as ffasta:
def dir_path(*args) -> PosixPath:
    """Resolve *args against the project root with ``here()``.

    The module-level ``dirname`` — either one directory name (str) or an
    iterable of names — is prepended to *args.

    Returns:
        PosixPath: the assembled project-relative path.
    """
    # isinstance (not `type(...) == str`) is the idiomatic check, accepts str
    # subclasses, and matches the sibling dir_path definition in this project.
    if isinstance(dirname, str):
        return here(dirname, *args)
    else:
        return here(*dirname, *args)
import pandas as pd import requests import urllib.request from pyhere import here from Bio import Entrez dataset = pd.read_csv(here('data', 'Uniprot', 'moubataUniprot.csv'), skiprows=[0], header=None, index_col=False) outFile = here('data', 'fasta', 'Ornithodoros_moubata2.fasta') print(len(dataset)) obsolete = [] row = 0 for i in dataset.itertuples(): ## retrieve the sequences using Uniprot IDs from the Uniprot website uniprotID = dataset.iloc(axis=0)[row, 0] url = 'https://www.uniprot.org/uniprot/' + uniprotID + '.fasta' row = row + 1\ try: with urllib.request.urlopen(url) as url: ## read the content from the url and decode it page = url.read() seq = page.decode('utf8') ## open and write a new fasta file with all the sequences corresponding to the Uniprot IDs stored in the csv file