Exemplo n.º 1
0
def pickle_csv(csvfile, pickle_fname=None):
    """Build a haplogroup tree from a CSV file and write its serialized form.

    Args:
        csvfile: path of the CSV file to parse with
            ``io_modules.csv.parse_csv``.
        pickle_fname: path of the output file; defaults to ``csvfile``
            with ``'.pickle'`` appended.

    Returns:
        None. The result of ``htree.serialize()`` is written to
        ``pickle_fname``.
    """
    if pickle_fname is None:
        pickle_fname = csvfile + '.pickle'

    # The tree is built while the CSV is still open, in case parse_csv
    # consumes the reader lazily.
    with open(csvfile, 'rb') as csv_handle:
        tree_file = csv.reader(csv_handle)
        aplo_list = io_modules.csv.parse_csv(tree_file)
        htree = tree.HaplogroupTree(aplo_list=aplo_list)

    # Closing the handle explicitly guarantees the data is flushed to disk
    # before the function returns.
    with open(pickle_fname, 'wb') as pickle_file:
        pickle_file.write(htree.serialize())
Exemplo n.º 2
0
def _read_data_lines(path):
    """Return the lines of the text file at *path*, minus its header line."""
    with open(path, 'r') as handle:
        return handle.readlines()[1:]


def _floats_in_place(record, indexes):
    """Convert ``record[i]`` to float for every i in *indexes*, best effort."""
    for index in indexes:
        try:
            record[index] = float(record[index])
        except (IndexError, KeyError, TypeError, ValueError):
            # Value missing or not numeric: deliberately left untouched.
            pass


def data_parsing(file_file, site_file, bestres_file, haptab_file):
    """Parse the annotation tables and the haplogroup tree.

    Args:
        file_file: path of the tab-separated pathogenicity table; its first
            line is treated as a header and skipped.
        site_file: path of the tab-separated site variability table; its
            first line is treated as a header and skipped.
        bestres_file: path of the comma-separated haplogroup assignment
            table; its first line is treated as a header and skipped.
        haptab_file: unused. Kept only so existing callers keep working;
            the file is no longer opened.

    Returns:
        A tuple ``(d, g, haplo, hapconto, best)`` where

        * ``d`` is the result of ``fillPathoDict2`` on the pathogenicity
          table,
        * ``g`` is the result of ``fillSiteVarDict`` on the variability
          table, with items 1 and 4 of each record converted to float when
          possible,
        * ``haplo`` maps each haplogroup of the tree to the list of its
          filtered position events,
        * ``hapconto`` is a ``Counter`` of all events in ``haplo``,
        * ``best`` is the result of ``fillDict`` on the assignment table.

    Note:
        Relies on the module-level name ``data_file`` to locate
        ``/data/phylotree_r17.pickle``.
    """
    # All inputs are read up front so that a missing file is reported
    # before any parsing work starts.
    patho_lines = _read_data_lines(file_file)
    site_lines = _read_data_lines(site_file)
    bestres_lines = _read_data_lines(bestres_file)

    print("Parsing pathogenicity table...")
    # NOTE(review): the trailing integer passed to the fill* helpers is
    # presumably the key column -- confirm against their definitions.
    d = fillPathoDict2(patho_lines, {}, '\t', 2)

    print("Parsing variability data...")
    g = fillSiteVarDict(site_lines, {}, '\t', 1)
    for record in g.values():
        # convert variability values (nt and aa, if available) to floats
        _floats_in_place(record, (1, 4))

    print("Parsing info about haplogroup-defining sites...")
    # NOTE(review): pickle_data is presumably unpickled by HaplogroupTree;
    # only load tree files from a trusted location.
    with open(data_file + '/data/phylotree_r17.pickle', 'rb') as tree_handle:
        htree = tree.HaplogroupTree(pickle_data=tree_handle.read())

    haplo = {}
    for haplogroup in htree._aplo_dict:
        events = []
        for event in htree.get_filtered_positions(haplogroup):
            # Change Transition and Transversion datatypes to SNP_MixIn,
            # because the Transition and Transversion variants parsed from
            # merged_diff are of that type.
            if event.mutation_type() in ('Transition', 'Transversion'):
                event_new = SNP_MixIn()
                event_new.start = event.start
                event_new.change = event.change
                events.append(event_new)
            else:
                events.append(event)
        haplo[haplogroup] = events

    # Count how many times each event occurs across all haplogroups.
    hapconto = Counter(
        event for events in haplo.values() for event in events)

    print("Parsing info about haplogroup assignments...")
    best = fillDict(bestres_lines, {}, ',', 1)
    return d, g, haplo, hapconto, best
Exemplo n.º 3
0
			sys.exit()
		elif o == "-i": contig_file = a
		elif o == "-m": muscle_exe = a
		elif o == "-b": basename = a
		elif o == "-s": best_results_file = a
		else:
			assert False, "Unhandled option."

	print "Your best results file is ", best_results_file
	# sample name
	f = os.path.abspath(contig_file)
	#sample_name = f.split('/')[-2].split('_')[-1]
	sample_name = contig_file.split('-')[0]
	
	# haplogroup tree parsing
	htrees = [(tree.HaplogroupTree(pickle_data=open(data_file + '/phylotree_r15.pickle', 'rb').read()), data_file + '/data/phylotree_r15.pickle')]
	# mhcs parsing
	mhcs_dict = parse_mhcs.parse2mhcs_dict(data_file + '/data/mhcs.tab')
	
	print "\nLoading contig sequences from file %s" % contig_file
	contig_array = load_sequences(contig_file)
	contig_array_seqdiff = [] # lista di liste
	contig_total_seqdiff = [] # lista di varianti
	contig_array_mappings = []
	
	print "\nAligning Contigs to mtDNA reference genome...\n"
	
	# update each contig's SeqDiff
	for x,contig in enumerate(contig_array):
		if x == 0:
			contig_seq_diff = align_sequence(muscle_exe, contig)
Exemplo n.º 4
0
def write_old_table(pickle_fname, out_fname):
    """Write every haplogroup of a serialized tree as old-style table rows.

    Args:
        pickle_fname: path of the serialized haplogroup tree to load.
        out_fname: path of the CSV file to write.

    Returns:
        None. One call to ``io_modules.old_table.write_haplogroup`` is made
        per haplogroup name yielded by iterating the tree.
    """
    # NOTE(review): pickle_data is presumably unpickled by HaplogroupTree;
    # only pass files from a trusted source.
    with open(pickle_fname, 'rb') as pickle_handle:
        htree = tree.HaplogroupTree(pickle_data=pickle_handle.read())

    # The output handle is closed explicitly so the table is fully flushed
    # to disk when the function returns.
    with open(out_fname, 'wb') as out_handle:
        fh = csv.writer(out_handle)
        for haplo_name in htree:
            io_modules.old_table.write_haplogroup(fh, '', htree[haplo_name])
from bioinf.seqs import SeqList
import io_modules.csv
import io_modules.old_table
import io_modules.serialize
import pandas as pd
import os.path
# NOTE(review): the raised limit is presumably needed because building or
# serializing the haplogroup tree recurses deeply -- confirm.
sys.setrecursionlimit(100000)

# Usage: <script> <tree.csv> <output basename>
csvfile, out_fname = sys.argv[1:]

# write pickle file for MToolBox
pickle_fname = csvfile + '.pickle'
# The tree is built while the CSV is still open, in case parse_csv consumes
# the reader lazily.
with open(csvfile, 'rb') as csv_handle:
    tree_file = csv.reader(csv_handle)
    aplo_list = io_modules.csv.parse_csv(tree_file)
    htree = tree.HaplogroupTree(aplo_list=aplo_list)
with open(pickle_fname, 'wb') as pickle_handle:
    pickle_handle.write(htree.serialize())

# write out alleles and haplogroups defined for HmtDB in csv file
pickle_file = pickle_fname
out_file = out_fname + '.csv'
with open(pickle_file, 'rb') as pickle_handle:
    htree = tree.HaplogroupTree(pickle_data=pickle_handle.read())
# The CSV must be closed (and therefore flushed) before pandas reads it back
# below; otherwise read_csv may see a truncated or empty file.
with open(out_file, 'wb') as table_handle:
    fh = csv.writer(table_handle)
    for haplo_name in htree:
        io_modules.old_table.write_haplogroup(fh, '', htree[haplo_name])

# write haplogroups.txt tab delimited file for MToolBox
out_file2 = out_fname + '.txt'
hap_file = pd.read_csv(out_file, sep=',', header=None)