def test_parse_genome_declaration_string(self):
    """Check parsing of genome declaration strings into ``BGGenome`` objects.

    The cases below pin the expected behaviour: the genome name is the
    text that follows the first ">" character of the declaration, with
    surrounding whitespace of the declaration not being part of the name.
    Any further ">" or punctuation characters stay in the name untouched.
    """
    # (declaration string, expected genome name)
    cases = (
        (">genome", "genome"),
        (" >genome ", "genome"),
        (">genome__genome", "genome__genome"),
        (">genome>genome", "genome>genome"),
        (">genome.!/.#4", "genome.!/.#4"),
    )
    for declaration, expected_name in cases:
        self.assertEqual(
            GRIMMReader.parse_genome_declaration_string(declaration),
            BGGenome(expected_name))
def get_block_neighbours(grimm_file):
    """Collect, for every block, its flanking neighbours in each genome.

    The file is scanned for genome declaration lines. The line directly
    following a declaration is parsed as that genome's block sequence; any
    other lines are skipped. The sequence is walked cyclically (indices wrap
    around modulo its length) and consecutive occurrences of the same block
    are merged into a single tandem run.

    For each run one record is stored::

        (left_neighbour, right_neighbour, tandem_copies, orientations)

    * ``left_neighbour`` is the preceding block's name suffixed with ``'h'``
      when that block has sign ``'+'`` and ``'t'`` otherwise;
      ``right_neighbour`` is the following block's name suffixed with
      ``'t'`` for sign ``'+'`` and ``'h'`` otherwise (presumably head/tail
      extremities).
    * ``tandem_copies`` is the number of consecutive copies in the run.
    * ``orientations`` is a tuple with the sign of every copy in the run.

    When the first copy of a run has sign ``'-'`` the run is reported as
    seen from the opposite strand: the two neighbours are swapped and the
    orientations are reversed and sign-flipped.

    :param grimm_file: path of the GRIMM file to read.
    :return: nested ``defaultdict`` such that
        ``result[int(block)][genome_name]`` is the list of records above.
    :raises IndexError: if a genome declaration is the last line of the file
        (there is no data line to read after it).
    """
    block_neighbours = defaultdict(lambda: defaultdict(list))

    with open(grimm_file) as f:
        lines = f.readlines()

    i = 0
    while i < len(lines):
        line = lines[i]
        if not GRIMMReader.is_genome_declaration_string(line):
            i += 1
            continue

        genome = GRIMMReader.parse_genome_declaration_string(line)
        # Only the second element of the parsed data is used; it is
        # unpacked below as a sequence of (sign, block) pairs.
        bs = GRIMMReader.parse_data_string(lines[i + 1])[1]
        n = len(bs)

        j = 0
        while j < n:
            tandem_copies = 1
            prev_or, prev_block = bs[j % n]
            _, curr_block = bs[(j + 1) % n]
            next_or, next_block = bs[(j + 2) % n]

            # The current block continues a run that started earlier; the
            # run is recorded once, from its first copy.
            if curr_block == prev_block:
                j += 1
                continue

            # Extend the run while the following block is another copy.
            # Terminates because prev_block differs from curr_block.
            while curr_block == next_block:
                j += 1
                tandem_copies += 1
                next_or, next_block = bs[(j + 2) % n]

            neighbours = (
                prev_block + ('h' if prev_or == '+' else 't'),
                next_block + ('t' if next_or == '+' else 'h'),
            )
            # Signs of all copies in the run, in sequence order.
            orientations = tuple(
                bs[(k + 1) % n][0]
                for k in range(j - tandem_copies + 1, j + 1))

            if orientations[0] == '-':
                # Report the run from the opposite strand: swap the flanks,
                # reverse the copy order and flip every sign ('+' <-> '-').
                neighbours = (neighbours[1], neighbours[0])
                orientations = tuple(
                    '+' if or_ == '-' else '-'
                    for or_ in orientations[::-1])

            block_neighbours[int(curr_block)][genome.name].append(
                (*neighbours, tandem_copies, orientations))
            j += 1

        # Skip both the declaration and its data line.
        i += 2

    return block_neighbours
def get_genomes_contain_blocks_grimm(grimm_file):
    """Summarise which blocks occur in which genomes of a GRIMM file.

    The file is read as strictly alternating line pairs starting at the
    first line: a genome declaration followed by one data line.

    :param grimm_file: path of the GRIMM file to read.
    :return: a 3-tuple ``(genomes, blocks, block_genome_count)`` where
        ``genomes`` is the sorted list of genome names, ``blocks`` is the
        sorted list of block ids converted to ``int``, and
        ``block_genome_count[block][genome]`` is the number of times the
        block occurs on that genome's data line(s).
    :raises IndexError: if the last declaration has no data line after it
        (odd number of lines).
    """
    with open(grimm_file) as grimm:
        lines = grimm.readlines()

    genome_names = set()
    block_ids = set()
    block_genome_count = defaultdict(Counter)

    for start in range(0, len(lines), 2):
        genome_name = GRIMMReader.parse_genome_declaration_string(
            lines[start]).name
        signed_blocks = GRIMMReader.parse_data_string(lines[start + 1])[1]

        genome_names.add(genome_name)
        # Each entry is a pair whose second item is the block id.
        for _, raw_block in signed_blocks:
            block_id = int(raw_block)
            block_ids.add(block_id)
            block_genome_count[block_id][genome_name] += 1

    return sorted(genome_names), sorted(block_ids), block_genome_count
# Apply the format string taken from the parsed arguments to `ch`
# (presumably a logging handler created earlier -- not visible here).
ch.setFormatter(logging.Formatter(args.c_logging_formatter_entry))
logger.info("Starting the converting process")

# genome name -> list of (chr_type, blocks) tuples, one per parsed data line.
genomes = defaultdict(list)
for file_name in args.grimm:
    logger.info(
        "Processing file \"{file_name}\"".format(file_name=file_name))
    with open(file_name, "rt") as source:
        # Reset per file: data lines that appear before the first genome
        # declaration of a file are ignored (see the `elif` below).
        current_genome = None
        for line in source:
            line = line.strip()
            # Skip empty lines and lines GRIMMReader recognises as comments.
            if len(line) == 0 or GRIMMReader.is_comment_string(
                    data_string=line):
                continue
            if GRIMMReader.is_genome_declaration_string(data_string=line):
                current_genome = GRIMMReader.parse_genome_declaration_string(
                    data_string=line).name
                if args.trim_names:
                    # Keep only the part of the name before the first
                    # occurrence of the trimmer character.
                    current_genome = current_genome.split(
                        args.trimmer_char, 1)[0]
            elif current_genome is not None:
                # NOTE(review): `current_chromosome` is not read anywhere in
                # the visible code -- confirm it is unused before removing.
                current_chromosome = []
                chr_type, blocks = GRIMMReader.parse_data_string(
                    data_string=line)
                genomes[current_genome].append((chr_type, blocks))

# `good_genomes` is given as a comma-separated string; an empty string
# means the option was not supplied.
if args.good_genomes != "":
    good_genomes = args.good_genomes.split(",")
    if args.trim_names:
        # Trim the requested names the same way the parsed names were
        # trimmed above, so that the two can be compared.
        good_genomes = [
            genome_name.split(args.trimmer_char, 1)[0]
            for genome_name in good_genomes
        ]
# Log the tool description and the values reported by the argument parser.
logger.info(full_description)
logger.info(parser.format_values())
# Apply the format string taken from the parsed arguments to `ch`
# (presumably a logging handler created earlier -- not visible here).
ch.setFormatter(logging.Formatter(args.c_logging_formatter_entry))
logger.info("Starting the converting process")

# genome name -> list of (chr_type, blocks) tuples, one per parsed data line.
genomes = defaultdict(list)
for file_name in args.grimm:
    logger.info("Processing file \"{file_name}\"".format(file_name=file_name))
    with open(file_name, "rt") as source:
        # Reset per file: data lines that appear before the first genome
        # declaration of a file are ignored (see the `elif` below).
        current_genome = None
        for line in source:
            line = line.strip()
            # Skip empty lines and lines GRIMMReader recognises as comments.
            if len(line) == 0 or GRIMMReader.is_comment_string(data_string=line):
                continue
            if GRIMMReader.is_genome_declaration_string(data_string=line):
                current_genome = GRIMMReader.parse_genome_declaration_string(data_string=line).name
                if args.trim_names:
                    # Keep only the part of the name before the first
                    # occurrence of the trimmer character.
                    current_genome = current_genome.split(args.trimmer_char, 1)[0]
            elif current_genome is not None:
                # NOTE(review): `current_chromosome` is not read anywhere in
                # the visible code -- confirm it is unused before removing.
                current_chromosome = []
                chr_type, blocks = GRIMMReader.parse_data_string(data_string=line)
                genomes[current_genome].append((chr_type, blocks))

# `good_genomes` is given as a comma-separated string; an empty string
# means the option was not supplied.
if args.good_genomes != "":
    good_genomes = args.good_genomes.split(",")
    if args.trim_names:
        # Trim the requested names the same way the parsed names were
        # trimmed above, so that the two can be compared.
        good_genomes = [genome_name.split(args.trimmer_char, 1)[0] for genome_name in good_genomes]
    # Iterate over a copy of the keys because entries are deleted from
    # `genomes` inside the loop; every genome not listed is dropped.
    for genome_name in list(genomes.keys()):
        if genome_name not in good_genomes:
            del genomes[genome_name]

# `bad_genomes` uses the same comma-separated convention.
if args.bad_genomes != "":
    bad_genomes = args.bad_genomes.split(",")