def get_mutation_rep_time(mutation_file, sample_name):
    """Return replication timings of the mutated nucleotides of one sample.

    For every chromosome listed by ``core.get_genome_file_names()`` the
    substitution mutations of ``sample_name`` whose final nucleotide is in
    ``core.FINAL_NUCL`` are read from ``mutation_file``.  A mutation is kept
    when the genome slice ``genome[position - 2:position + 1]`` is one of
    ``core.MOTIFS`` (presumably the mutated nucleotide with one neighbour on
    each side, i.e. 1-based positions -- confirm against ``core``).  For each
    kept mutation the value of ``core.calculate_replication_timing`` is
    collected.

    Progress (the sample name, then each chromosome name) is written to
    stdout.

    Args:
        mutation_file: mutations source, passed through to
            ``core.read_mutations``.
        sample_name: name of the sample whose mutations are considered.

    Returns:
        list with one replication timing per kept mutation.  A timing of -1
        is reported on stdout as uncalculatable but is still appended.
    """
    sys.stdout.write("\nConsidering {0} sample: ".format(sample_name))
    mutation_rep_time = []

    # Chromosomes are processed one at a time to avoid memory overflow:
    # the whole genome in str format is ~3.1 GB, too much RAM to hold at once.
    genome_file_names = core.get_genome_file_names()
    for chromosome in genome_file_names:
        sys.stdout.write(chromosome + ', ')
        sys.stdout.flush()

        mutations_list = core.read_mutations(mutation_file,
                                             mutation_type='subs',
                                             chromosome=chromosome,
                                             sample_names=[sample_name],
                                             final_nucleotides=core.FINAL_NUCL)
        # The sequence used to be stored under a misspelled name ("genom")
        # while every later statement used "genome", raising NameError.
        with open(genome_file_names[chromosome]) as genome_file:
            genome = genome_file.read()

        # FIXME: Considering mutations_list 20 times - inefficient
        for _, mutation in mutations_list.iterrows():
            position = mutation['positionFrom']
            motif = genome[position - 2: position + 1]
            if motif not in core.MOTIFS:
                continue
            rep_time = core.calculate_replication_timing(chromosome, position)
            if rep_time == -1:
                # A parenthesised single-argument print behaves the same on
                # Python 2 and Python 3.
                print('\nuncalculatable replication time at '
                      '{0}:{1}'.format(chromosome, position))
            mutation_rep_time.append(rep_time)

        # Release this chromosome's sequence before loading the next one.
        del genome
    return mutation_rep_time
def filterMutations(mutations_file, catalogFileName, out_file):
    """Write the substitution mutations of the catalogued samples to a file.

    Sample names are taken from the first line of the catalog: its
    tab-separated fields without the leading one.  According to the original
    author the catalog lists genome samples only, so exome samples are left
    out; requesting ``mutation_type='subs'`` leaves out indel mutations.

    Args:
        mutations_file: mutations source, passed through to
            ``core.read_mutations``.
        catalogFileName: path of the catalog whose header line holds the
            sample names.
        out_file: destination for the tab-separated output, written without
            header and without index.

    Returns:
        None.
    """
    with open(catalogFileName) as catalog_file:
        # Strip only line terminators.  Slicing off the last character
        # unconditionally would truncate the last sample name when the header
        # has no trailing newline, and leave a stray '\r' on CRLF files.
        header = catalog_file.readline().rstrip('\r\n')

    # The first field is the row label ("Mutation type"), not a sample name.
    genome_sample_names = header.split('\t')[1:]

    mutations = core.read_mutations(mutations_file,
                                    mutation_type='subs',
                                    sample_names=genome_sample_names)
    mutations.to_csv(out_file, sep='\t', header=False, index=False)
def check_chromosome(genome_file, mutations_file, chromosome):
    """Check that mutations agree with the genome sequence of a chromosome.

    For every substitution mutation of ``chromosome`` read from
    ``mutations_file``, the character of the genome at 1-based index
    ``positionFrom`` is compared with the mutation's ``initialNucl``.

    Args:
        genome_file: path of the file holding the chromosome sequence; it is
            read whole and indexed as a plain string.
        mutations_file: mutations source, passed through to
            ``core.read_mutations``.
        chromosome: chromosome whose mutations are checked.

    Returns:
        None.  A success line is printed when every mutation matches.

    Raises:
        SystemExit: on the first mutation whose position lies outside the
            sequence or whose initial nucleotide differs from the genome.
    """
    with open(genome_file, 'r') as f:
        genome_sequence = f.read()
    genome_length = len(genome_sequence)

    mutations_list = core.read_mutations(mutations_file,
                                         mutation_type='subs',
                                         chromosome=chromosome)

    mismatch_message = ('\n{0}:{1} nucleotide in genome ({2}) and '
                        'mutations file ({3}) not equal')
    out_of_range_message = ('\n{0}:{1} position is outside of genome '
                            'sequence (length {2})')

    for _, mutation in mutations_list.iterrows():
        position = mutation['positionFrom']
        # Without this guard a position below 1 would silently wrap around
        # through negative indexing and compare against the wrong nucleotide,
        # while a position past the end would raise a bare IndexError.
        if not 1 <= position <= genome_length:
            sys.exit(out_of_range_message.format(chromosome, position,
                                                 genome_length))

        genome_nucleotide = genome_sequence[position - 1]
        mutation_nucleotide = mutation['initialNucl']
        if genome_nucleotide != mutation_nucleotide:
            sys.exit(mismatch_message.format(chromosome, position,
                                             genome_nucleotide,
                                             mutation_nucleotide))

    # A parenthesised single-argument print behaves the same on Python 2
    # and Python 3.
    print("Chromosome {0} check succeed".format(chromosome))