def main():
    """Parse the command line and regenerate the dna2vec tables and figures.

    Rebinds the module-level names ``mk_model`` (the embedding loaded from
    the ``-e`` path) and ``outdir`` (the ``-o`` directory), creates the
    output directory if needed, then runs the figure/table generators in
    order: Figure 1, Table 1, Table 2, Figure 3.

    Command-line options:
        -n / --num-samples  sample count for the arithmetic experiments
                            (default 1000)
        -e / --embedding    embedding file to load (required)
        -o / --output-dir   output directory (required)
    """
    global mk_model
    global outdir

    parser = argparse.ArgumentParser(
        description="""Generate tables and figures from 'dna2vec: Consistent vector representations of variable-length k-mers'""")
    # No nargs=1 here: with nargs=1 a supplied value arrives as a list while
    # the default stays a bare int, so indexing [0] crashed whenever -n was
    # omitted. type=int handles the conversion for both cases.
    parser.add_argument(
        '-n', '--num-samples', dest='n_samples', type=int, default=1000,
        help='Number of samples for arithmetic experiments '
             '(tables 1 and 2, figure 3)')
    # Both paths are mandatory; let argparse report a missing one instead of
    # failing later on None.
    parser.add_argument(
        '-e', '--embedding', dest='embed_to_read', required=True,
        help='Path of the dna2vec embedding file to read')
    parser.add_argument(
        '-o', '--output-dir', dest='outdir', required=True,
        help='Output directory')
    args = parser.parse_args()

    n_samples = args.n_samples
    embed_to_read = args.embed_to_read
    outdir = args.outdir

    # makedirs copes with nested paths, trailing slashes and directories that
    # already exist, none of which the listdir()/mkdir() pair handled.
    os.makedirs(outdir, exist_ok=True)

    mk_model = MultiKModel(embed_to_read)

    print('Generating Figure 1...')
    generate_figure1(num_mers=10000)
    print('Generating Table 1 (slow)...')
    generate_table1(n_samples=n_samples)
    print('Generating Table 2 (very slow)...')
    generate_table2(n_samples=n_samples)
    print('Generating Figure 3 (slow)...')
    generate_figure3(n_samples=n_samples)
    print('... Done!')
def mk_model():
    """Build and return a MultiKModel from the bundled pretrained embedding.

    The embedding path is fixed and relative, so it is resolved against the
    current working directory.
    """
    return MultiKModel(
        'pretrained/dna2vec-20161219-0153-k3to8-100d-10c-29320Mbp-sliding-Xat.w2v'
    )
from dna2vec.multi_k_model import MultiKModel
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM
import csv

# Pretrained dna2vec embedding used to turn k-mers into vectors.
filepath = 'pretrained/dna2vec-20161219-0153-k3to8-100d-10c-29320Mbp-sliding-Xat.w2v'
mk_model = MultiKModel(filepath)

# Training data; the 'sequence' and 'label' columns are read below.
df = pd.read_csv('train.csv')

# Each sequence is split into its first 8 characters and the remainder, and
# each part is embedded separately.
# NOTE(review): the name vec6 suggests the remainder is 6 bases long
# (14-base sequences) - confirm against the data.
vec8 = []
vec6 = []
print("Converting sequence to vectors")
for d in df['sequence']:
    str8 = d[:8]
    str6 = d[8:]
    vec8.append(mk_model.vector(str8))
    vec6.append(mk_model.vector(str6))

# One row per input sequence rather than a hard-coded 2000, so X always has
# the same number of rows as Y regardless of the size of train.csv.
# Axis 1 holds the two parts of a sequence: [first-8 vector, remainder vector].
# NOTE(review): 100 is presumably the vector size of the "100d" embedding
# named in filepath - confirm if a different embedding is used.
X = np.zeros((len(vec8), 2, 100))
Y = df['label']
for i, (v8, v6) in enumerate(zip(vec8, vec6)):
    X[i][0] = v8
    X[i][1] = v6
from dna2vec.multi_k_model import MultiKModel
import numpy as np

# Input FASTA file and the pretrained dna2vec embedding.
data = "./data/yeastIST.fasta"
filepath = './dna2vec_main/pretrained/dna2vec-20161219-0153-k3to8-100d-10c-29320Mbp-sliding-Xat.w2v'
mk_model = MultiKModel(filepath)
k = 4
file_index = 0

# Substitution table: each non-ACGT FASTA letter is mapped to one plain base.
# NOTE(review): in the IUPAC nucleotide code Y stands for C or T and U is
# uracil, so "Y" -> "G" and "U" -> "C" look unusual - confirm they are intended.
specialFastaChars = {
    "R": "G",
    "Y": "G",
    "K": "T",
    "M": "A",
    "S": "C",
    "W": "A",
    "B": "C",
    "D": "T",
    "H": "A",
    "N": "T",
    "V": "G",
    "U": "C",
}

identifier = ""
# Results file; opened here and left open for the code that follows.
result = open("./results/other.txt", "w+")
with open(data, "r+") as file_data:
    arrSpecial = specialFastaChars.keys()
    for line in file_data:
        # A line starting with '>' is a FASTA header: remember it as the
        # identifier of the current record.
        if line.startswith(">"):
            identifier = line
from dna2vec.multi_k_model import MultiKModel  # for converting short-read DNA sequences into vectorized reads

filepath = '../../pretrained/dna2vec-20161219-0153-k3to8-100d-10c-29320Mbp-sliding-Xat.w2v'  # path to file with vectorized k-mer representations
mk_model = MultiKModel(filepath)  # instantiating model for fetching vectorized k-mer representations


def kmer_frequency(sequence, start, end, data):
    """Count every k-mer of length ``start`` through ``end`` in ``sequence``.

    Keyword arguments:
    sequence -- DNA record sequence (string)
    start -- min k-mer length (int)
    end -- max k-mer length, inclusive (int)
    data -- dictionary to populate with k-mer frequency data (dict); it is
            updated in place, so existing counts are added to

    Returns the same ``data`` dictionary, mapping k-mer -> occurrence count.
    """
    for length in range(start, end + 1):
        for kmer in generate_kmers(sequence, length):
            data[kmer] = data.get(kmer, 0) + 1
    return data


def generate_kmers(sequence, length):
    """Yield every contiguous k-mer of exactly ``length`` characters.

    K-mers are produced left to right with a step of one, including the one
    that ends at the last character of ``sequence``. Nothing is yielded when
    ``length`` is greater than ``len(sequence)``.
    """
    # "+ 1" so the window ending at the final character is included; without
    # it the last k-mer of every sequence was silently dropped.
    for offset in range(len(sequence) - length + 1):
        yield sequence[offset:offset + length]
pickled_model_file = 'models/refseq_training_vec_k3to8_160K_samples_model.p' args = vars(parser.parse_args()) samples = int(args['samples']) print 'Using %s samples...' % samples filepath = args['file'] if not os.path.exists(filepath): 'dna2vec model file does not exist: ' + filepath sys.exit(1) print 'Using dna2vec model: ' + filepath mk_model = MultiKModel(filepath) herb_seqs = [] with open('bacmet_contaminated_oryza_sequences.csv', 'rb') as csvfile: herb_reader = csvfile.readlines() #(csvfile, delimiter=' ', quotechar='|') for row in herb_reader[1:]: r = row.split(',') r.pop(0) r = [i.rstrip() for i in r] for i in r:
from dna2vec.multi_k_model import MultiKModel

# Pretrained dna2vec embedding, resolved relative to the working directory.
filepath = 'pretrained/dna2vec-20161219-0153-k3to8-100d-10c-29320Mbp-sliding-Xat.w2v'
mk_model = MultiKModel(filepath)

# Smoke test: look up one 8-mer and print the length of its vector.
vector = mk_model.vector('AAAAAAAA')
print(len(vector))
results = {}
fastafiles_dict = {}
outputfile = 'vectorisation_results/vectorisation_results.csv'
mk_model = MultiKModel('dna2vec_1-8_all.w2v')

# Collect the path of every file below the directory given as the first
# command-line argument.
files = []
for (dirpath, dirnames, filenames) in os.walk(sys.argv[1]):
    files.extend(os.path.join(dirpath, name) for name in filenames)

k = 1
for file in files:
    basename = os.path.basename(file)
    # Every collected file is parsed as FASTA, one record at a time.
    for seq_record in SeqIO.parse(file, "fasta"):
        print(seq_record.id)
        # Strip spaces, upper-case, then drop every character that is not
        # one of G, A, T, C.
        full_sequence = re.sub(
            '[^GATC]', "", str(seq_record.seq.ungap(' ')).upper())
        # NOTE(review): integer accumulator of length 100; if float embedding
        # vectors are added into it in place, numpy will refuse the cast -
        # confirm the dtype against the code that uses it.
        sumvect = np.zeros(100, dtype=int)
        kmer = 8
from joblib import Parallel, delayed
import multiprocessing
from generate_hdf5 import ReadWrite
import time
import glob
from dna2vec.multi_k_model import MultiKModel

# Command-line options: where the genome FASTA files and the .w2v file live.
parser = argparse.ArgumentParser()
parser.add_argument("--genome_file_path", help="Directory of FASTA file")
parser.add_argument("--dna2vec_file_path",
                    help="Directory where .w2v file is saved (from dna2vec)")
args = parser.parse_args()

num_cores = multiprocessing.cpu_count()
read_write = ReadWrite()
mk_model = MultiKModel(args.dna2vec_file_path)
# Glob pattern built by plain concatenation, so --genome_file_path must end
# with a path separator for the pattern to point inside that directory.
fasta_files = args.genome_file_path + "*.fa"


def embedding(seq):
    """Embed ``seq`` as a list of vectors, one per non-overlapping 3-mer.

    Returns a tuple ``(gene_vec, embed_len)`` where ``embed_len`` is
    ``(len(seq) - 1) // 3`` and ``gene_vec`` holds the ``mk_model`` vector of
    each of the first ``embed_len`` consecutive triplets.

    NOTE(review): because of the ``- 1``, a sequence whose length is an exact
    multiple of 3 does not get its final triplet embedded - confirm this is
    intended (e.g. to ignore a trailing character).
    """
    embed_len = (len(seq) - 1) // 3
    gene_vec = [
        mk_model.vector(seq[pos:pos + 3])
        for pos in range(0, embed_len * 3, 3)
    ]
    return (gene_vec, embed_len)