Пример #1
0
def main():
    """Parse the command line, load the embedding and regenerate all outputs.

    Command-line options:
        -n / --num-samples  Number of samples for the arithmetic experiments
                            (defaults to 1000).
        -e / --embedding    Path of the embedding file handed to MultiKModel.
        -o / --output-dir   Directory that receives the generated output;
                            created if it does not exist yet.

    Side effects:
        Sets the module-level globals ``mk_model`` and ``outdir``, which the
        ``generate_*`` helpers are expected to read.
    """
    global mk_model
    global outdir

    parser = argparse.ArgumentParser(
        description="""Generate tables and figures from 'dna2vec: Consistent 
    vector representations of variable-length k-mers'""")

    # No ``nargs=1`` here: with ``nargs=1`` a supplied value arrives as a
    # one-element list while the default stays a bare int, so indexing the
    # result with ``[0]`` crashed whenever -n was omitted.
    parser.add_argument(
        '-n',
        '--num-samples',
        dest='n_samples',
        type=int,
        default=1000,
        help=
        'Number of samples for arithmetic experiments (tables 1 and 2, figure 3)'
    )
    parser.add_argument('-e',
                        '--embedding',
                        dest='embed_to_read',
                        required=True,
                        help='Path of the embedding file to read')
    parser.add_argument('-o',
                        '--output-dir',
                        dest='outdir',
                        required=True,
                        help='Output directory')

    args = parser.parse_args()
    n_samples = args.n_samples
    embed_to_read = args.embed_to_read
    outdir = args.outdir

    # makedirs(exist_ok=True) also copes with nested paths and trailing
    # slashes, which a membership test against os.listdir() does not.
    os.makedirs(outdir, exist_ok=True)

    mk_model = MultiKModel(embed_to_read)

    print('Generating Figure 1...')
    generate_figure1(num_mers=10000)
    print('Generating Table 1 (slow)...')
    generate_table1(n_samples=n_samples)
    print('Generating Table 2 (very slow)...')
    generate_table2(n_samples=n_samples)
    print('Generating Figure 3 (slow)...')
    generate_figure3(n_samples=n_samples)
    print('... Done!')
Пример #2
0
def mk_model():
    """Build and return a MultiKModel from the bundled pretrained .w2v file."""
    return MultiKModel(
        'pretrained/dna2vec-20161219-0153-k3to8-100d-10c-29320Mbp-sliding-Xat.w2v'
    )
Пример #3
0
from dna2vec.multi_k_model import MultiKModel
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM
import csv

# Pretrained dna2vec embedding file used to turn k-mers into vectors.
filepath = 'pretrained/dna2vec-20161219-0153-k3to8-100d-10c-29320Mbp-sliding-Xat.w2v'
mk_model = MultiKModel(filepath)

df = pd.read_csv('train.csv')
vec8 = []
vec6 = []
print("Converting sequence to vectors")
# Every sequence is split into its first 8 characters and the remainder;
# each part is looked up as one k-mer in the embedding.
for d in df['sequence']:
    str8 = (d[:8])
    str6 = (d[8:])
    vec8.append(mk_model.vector(str8))
    vec6.append(mk_model.vector(str6))

# Size the feature tensor from the data actually read instead of a fixed
# 2000 rows, so X always has exactly one entry per row of Y.
# The trailing 100 is the vector length expected from mk_model.vector().
X = np.zeros((len(vec8), 2, 100))
Y = df['label']

# Slot 0 holds the vector of the 8-character prefix, slot 1 the remainder.
for i, (prefix_vec, suffix_vec) in enumerate(zip(vec8, vec6)):
    X[i][0] = prefix_vec
    X[i][1] = suffix_vec
Пример #4
0
from dna2vec.multi_k_model import MultiKModel
import numpy as np
# Input FASTA file and the pretrained dna2vec embedding used to vectorise it.
data = "./data/yeastIST.fasta"
filepath = './dna2vec_main/pretrained/dna2vec-20161219-0153-k3to8-100d-10c-29320Mbp-sliding-Xat.w2v'
mk_model = MultiKModel(filepath)
k = 4
file_index = 0

# Substitution table: each non-ACGT letter is mapped to one fixed base.
# NOTE(review): the keys look like IUPAC ambiguity codes - confirm that the
# chosen replacement bases are intentional.
specialFastaChars = {
    "R": "G",
    "Y": "G",
    "K": "T",
    "M": "A",
    "S": "C",
    "W": "A",
    "B": "C",
    "D": "T",
    "H": "A",
    "N": "T",
    "V": "G",
    "U": "C",
}

# Most recently seen header line (a line starting with ">"), newline included.
identifier = ""

# Output handle, opened with "w+" (truncate, read/write). It is not closed
# anywhere in the visible code.
result = open("./results/other.txt", "w+")

with open(data, "r+") as file_data:
    # Keys of the substitution table, i.e. the characters to be replaced.
    arrSpecial = specialFastaChars.keys()
    for line in file_data:
        # A leading ">" marks a header line; remember it as the current
        # identifier.
        if line[0] == ">":
            identifier = line
Пример #5
0
from dna2vec.multi_k_model import MultiKModel # for converting short-read DNA sequences into vectorized reads

# Path to the file with vectorized k-mer representations.
filepath = '../../pretrained/dna2vec-20161219-0153-k3to8-100d-10c-29320Mbp-sliding-Xat.w2v'
# Model instance used for fetching vectorized k-mer representations.
mk_model = MultiKModel(filepath)

def kmer_frequency(sequence, start, end, data):
    """
    Count the k-mers of a sequence for every k-mer length from start to end.

    Keyword arguments:
    sequence -- DNA record sequence (string)
    start -- min k-mer length (int)
    end -- max k-mer length, inclusive (int)
    data -- dictionary updated in place, mapping k-mer -> count; counts
            already present are incremented (dict)

    Returns the same dictionary that was passed in as data.
    """
    for size in range(start, end + 1):
        for fragment in generate_kmers(sequence, size):
            data[fragment] = data.get(fragment, 0) + 1
    return data


def generate_kmers(sequence, length):
    """Yield every contiguous k-mer of the given length, left to right.

    Keyword arguments:
    sequence -- sequence to slide the window over (string)
    length -- k-mer length (int)

    Yields one slice of `sequence` per window position. Nothing is yielded
    when `length` exceeds the sequence length.
    """
    # "+ 1" so that the final window, the one ending on the last character,
    # is produced as well; without it the last k-mer was silently dropped.
    for offset in range(len(sequence) - length + 1):
        yield sequence[offset:offset + length]

# Location of the pickled model.
pickled_model_file = 'models/refseq_training_vec_k3to8_160K_samples_model.p'

# Command-line options as a plain dict; the 'samples' and 'file' keys are
# read below.
args = vars(parser.parse_args())

samples = int(args['samples'])

# Single-argument print(...) behaves the same under Python 2 and Python 3.
print('Using %s samples...' % samples)

filepath = args['file']

if not os.path.exists(filepath):
    # The message used to be a bare string expression and was never shown,
    # so the script exited with status 1 without any explanation.
    print('dna2vec model file does not exist: ' + filepath)
    sys.exit(1)
print('Using dna2vec model: ' + filepath)

mk_model = MultiKModel(filepath)

herb_seqs = []

with open('bacmet_contaminated_oryza_sequences.csv', 'rb') as csvfile:

    herb_reader = csvfile.readlines()  #(csvfile, delimiter=' ', quotechar='|')

    for row in herb_reader[1:]:

        r = row.split(',')
        r.pop(0)

        r = [i.rstrip() for i in r]

        for i in r:
Пример #7
0
from dna2vec.multi_k_model import MultiKModel

# Pretrained dna2vec embedding file.
filepath = 'pretrained/dna2vec-20161219-0153-k3to8-100d-10c-29320Mbp-sliding-Xat.w2v'
mk_model = MultiKModel(filepath)

# Look up one 8-mer and report how many components its vector has.
vector_size = len(mk_model.vector('AAAAAAAA'))
print(vector_size)
Пример #8
0
results = {}

# Disabled interval-building loop, kept for reference:
#print("build interval--> start")
#while co >= 0 :
#   endinterval = start + step
#   interval_dict[i] = [start,endinterval]
#   start = endinterval
#   co -= 1
#   i +=1
#print("build interval--> end")

fastafiles_dict = {}
outputfile = 'vectorisation_results/vectorisation_results.csv'
# Alternative output name used earlier:
#outputfile = 'metagenomics_signatures_1-8_CAMI_from_genome_means.csv'
mk_model = MultiKModel('dna2vec_1-8_all.w2v')

# Collect the full path of every file found below the directory given as
# the first command-line argument.
files = []
for dirpath, dirnames, filenames in os.walk(sys.argv[1]):
    files.extend(os.path.join(dirpath, name) for name in filenames)
k = 1

# Walk every collected file and process each FASTA record in it.
for file in files:
    #print(file)
    basename = os.path.basename(file)
    for seq_record in SeqIO.parse(file, "fasta"):
        print(seq_record.id)
        # Upper-case the sequence and drop every character other than
        # G, A, T or C. ungap(' ') presumably strips space "gap" characters
        # first - confirm against the Biopython version in use.
        full_sequence = re.sub('[^GATC]', "",
                               str(seq_record.seq.ungap(' ')).upper())
        # 100-element accumulator of zeros with an integer dtype.
        # NOTE(review): if float vectors are added into it later, the int
        # dtype may truncate or reject them - verify.
        sumvect = np.zeros((100, ), dtype=int)
        kmer = 8
Пример #9
0
from joblib import Parallel, delayed
import multiprocessing
from generate_hdf5 import ReadWrite
import time
import glob
from dna2vec.multi_k_model import MultiKModel

# Both options are used unconditionally below, so they are marked required:
# omitting one now yields a clear argparse usage error instead of a
# TypeError raised from None further down.
parser = argparse.ArgumentParser()
parser.add_argument("--genome_file_path",
                    required=True,
                    help="Directory of FASTA file")
parser.add_argument("--dna2vec_file_path",
                    required=True,
                    help="Directory where .w2v file is saved (from dna2vec)")
args = parser.parse_args()

num_cores = multiprocessing.cpu_count()
read_write = ReadWrite()
mk_model = MultiKModel(args.dna2vec_file_path)
# Glob pattern built by plain concatenation: --genome_file_path has to end
# with a path separator for the pattern to match files inside the directory.
fasta_files = args.genome_file_path + "*.fa"


def embedding(seq):
    """Embed a sequence as a list of vectors, one per 3-character chunk.

    Args:
        seq: Sequence to embed; sliced into consecutive, non-overlapping
            chunks of three characters.

    Returns:
        A tuple ``(gene_vec, embed_len)`` where ``gene_vec`` is the list of
        ``mk_model.vector(chunk)`` results and ``embed_len`` is its length.
        Sequences too short for a single chunk yield ``([], 0)``.
    """
    # seq = gene_seq[gene]
    # NOTE(review): (len(seq) - 1) // 3 leaves out the final chunk when
    # len(seq) is an exact multiple of 3 - confirm whether the input carries
    # one trailing character that is meant to be ignored.
    # Clamped at 0 so an empty sequence no longer reports a length of -1.
    embed_len = max(0, (len(seq) - 1) // 3)

    gene_vec = [
        mk_model.vector(seq[i * 3:i * 3 + 3]) for i in range(embed_len)
    ]

    return (gene_vec, embed_len)