def pickle_csv(csvfile, pickle_fname=None):
    """Parse a phylotree CSV file and serialize the resulting haplogroup
    tree to a pickle file.

    Parameters:
        csvfile      -- path to the phylotree CSV file.
        pickle_fname -- output path for the pickled tree; defaults to
                        csvfile + '.pickle'.
    """
    if pickle_fname is None:
        pickle_fname = csvfile + '.pickle'
    # 'rb' mode: this codebase targets Python 2, where csv.reader
    # expects a binary-mode file object.
    csv_fh = open(csvfile, 'rb')
    try:
        tree_file = csv.reader(csv_fh)
        aplo_list = io_modules.csv.parse_csv(tree_file)
    finally:
        csv_fh.close()
    htree = tree.HaplogroupTree(aplo_list=aplo_list)
    # Close explicitly: the original left the handle open, so the
    # serialized tree was never guaranteed to be flushed to disk.
    pickle_file = open(pickle_fname, 'wb')
    try:
        pickle_file.write(htree.serialize())
    finally:
        pickle_file.close()
def data_parsing(file_file, site_file, bestres_file, haptab_file): #apro i file di input #diff = open(diff_file, 'r') file = open(file_file, 'r') site = open(site_file, 'r') bestres = open(bestres_file, 'r') haptab = open(haptab_file, 'r') print "Parsing pathogenicity table..." #dizionario del tabellone pato # scarto la prima riga (intestazione) file = file.readlines()[1:] d = {} d = fillPathoDict2(file, d, '\t', 2) print "Parsing variability data..." #dizionario sitevar # scarto la prima riga (intestazione) site = site.readlines()[1:] g = {} g = fillSiteVarDict(site, g, '\t', 1) for pos in g.keys(): # convert variability values (nt and aa, if available) in floats try: g[pos][1] = float(g[pos][1]) except: pass try: g[pos][4] = float(g[pos][4]) except: pass print "Parsing info about haplogroup-defining sites..." haplo = {} htree = tree.HaplogroupTree( pickle_data=open(data_file + '/data/phylotree_r17.pickle', 'rb').read()) for haplogroup in htree._aplo_dict.keys(): haplo[haplogroup] = [] # change Transition and Transversion datatypes to SNP_MixIn, # because Transition and Transversion variants parsed from merged_diff are that type for event in htree.get_filtered_positions(haplogroup): if event.mutation_type() in ['Transition', 'Transversion']: event_new = SNP_MixIn() event_new.start = event.start event_new.change = event.change haplo[haplogroup].append(event_new) else: haplo[haplogroup].append(event) # la funzione di cui sopra sostituisce la procedura qui sotto #haplo = [line.strip().split('\t') for line in haptab] #lista di liste aplogruppo/variante # DS .. 
convert strings of variants (eg '146T') in datatypes.SNP (eg Transition(146)) #for i in haplo[1:]: # i[1] = pprint2datatype(i[1]) #print haplo[:4] #hapconto = hapcon.read() #file letto interamente che usero' per contare quante volte e' presente una variante haplo_sites = [] for haplogroup in haplo.keys(): haplo_sites.extend(haplo[haplogroup]) hapconto = Counter(haplo_sites) #hapcon = open(haptab_file, 'r') #hapcont = [line.strip().split('\t') for line in hapcon] #hapconto = [] #for l in hapcont: # for e in l: # hapconto.append(e) #print hapconto.keys()[:10] #print "7146A is found %d times in hapconto" % (hapconto.count('7146A')) print "Parsing info about haplogroup assignments..." #dizionario best results # scarto la prima riga (intestazione) bestres = bestres.readlines()[1:] best = {} best = fillDict(bestres, best, ',', 1) #for s in best.keys(): # best[s] = [best[s][0].split(';')[0]] #print best return d, g, haplo, hapconto, best
sys.exit() elif o == "-i": contig_file = a elif o == "-m": muscle_exe = a elif o == "-b": basename = a elif o == "-s": best_results_file = a else: assert False, "Unhandled option." print "Your best results file is ", best_results_file # sample name f = os.path.abspath(contig_file) #sample_name = f.split('/')[-2].split('_')[-1] sample_name = contig_file.split('-')[0] # haplogroup tree parsing htrees = [(tree.HaplogroupTree(pickle_data=open(data_file + '/phylotree_r15.pickle', 'rb').read()), data_file + '/data/phylotree_r15.pickle')] # mhcs parsing mhcs_dict = parse_mhcs.parse2mhcs_dict(data_file + '/data/mhcs.tab') print "\nLoading contig sequences from file %s" % contig_file contig_array = load_sequences(contig_file) contig_array_seqdiff = [] # lista di liste contig_total_seqdiff = [] # lista di varianti contig_array_mappings = [] print "\nAligning Contigs to mtDNA reference genome...\n" # update each contig's SeqDiff for x,contig in enumerate(contig_array): if x == 0: contig_seq_diff = align_sequence(muscle_exe, contig)
def write_old_table(pickle_fname, out_fname):
    """Load a pickled HaplogroupTree and write it out in the old
    CSV table format, one row per haplogroup.

    Parameters:
        pickle_fname -- path to the pickled haplogroup tree.
        out_fname    -- path of the CSV file to write.
    """
    # Close the pickle handle after reading (the original leaked it).
    pickle_fh = open(pickle_fname, 'rb')
    try:
        htree = tree.HaplogroupTree(pickle_data=pickle_fh.read())
    finally:
        pickle_fh.close()
    # 'wb' mode: Python 2 csv.writer expects a binary-mode file.
    # Close explicitly so the last rows are flushed to disk.
    out_fh = open(out_fname, 'wb')
    try:
        fh = csv.writer(out_fh)
        for haplo_name in htree:
            io_modules.old_table.write_haplogroup(fh, '', htree[haplo_name])
    finally:
        out_fh.close()
from bioinf.seqs import SeqList
import io_modules.csv
import io_modules.old_table
import io_modules.serialize
import pandas as pd
import os.path

# Script body: convert a phylotree CSV into (1) a pickled tree for
# MToolBox, (2) an old-format CSV table, and (3) a tab-delimited
# haplogroups file.
# NOTE(review): 'sys', 'csv' and 'tree' are used below but not imported
# in this chunk — presumably imported earlier in the file; confirm.

# Deep haplogroup trees can exceed the default recursion limit during
# (de)serialization.
sys.setrecursionlimit(100000)
# Usage: <script> <phylotree csv> <output basename>
csvfile, out_fname = sys.argv[1:]

# Write pickle file for MToolBox.
tree_file = csv.reader(open(csvfile, 'rb'))
pickle_fname = csvfile + '.pickle'
aplo_list = io_modules.csv.parse_csv(tree_file)
htree = tree.HaplogroupTree(aplo_list=aplo_list)
pickle_file = open(pickle_fname, 'wb')
pickle_file.write(htree.serialize())
pickle_file.close()

# Write out alleles and haplogroups defined for HmtDB in a CSV file.
# NOTE(review): this re-reads the pickle just written and rebuilds the
# same tree — the in-memory 'htree' above could likely be reused.
pickle_file = pickle_fname
out_file = out_fname + '.csv'
htree = tree.HaplogroupTree(pickle_data=open(pickle_file, 'rb').read())
fh = csv.writer(open(out_file, 'wb'))
for haplo_name in htree:
    io_modules.old_table.write_haplogroup(fh, '', htree[haplo_name])

# Write haplogrups.txt tab-delimited file for MToolBox.
# Re-read the CSV just produced; processing of 'hap_file' presumably
# continues past this chunk.
out_file2 = out_fname + '.txt'
hap_file = pd.read_csv(out_file, sep=',', header=None)