def __init__(self, upstream, downstream, ref_base=None, alt_answer=None,
             mut_type=None, path_to_seq_file=None, seq_dir=None,
             seq_filename=None):
    """Store the comparison settings and prepare the sequence file.

    Args:
        upstream: stored unchanged on the instance as ``self.upstream``.
        downstream: stored unchanged on the instance as ``self.downstream``.
        ref_base: stored unchanged as ``self.ref_base``.
        alt_answer: stored unchanged as ``self.alt_answer``.
        mut_type: stored unchanged as ``self.mut_type``.
        path_to_seq_file: passed to ``get_AB1_file.handle_seq_file`` and
            used as the default for ``self.seq_filename``.
        seq_dir: passed to ``get_AB1_file.handle_seq_file`` and stored as
            ``self.seq_dir``.
        seq_filename: explicit file name that overrides ``path_to_seq_file``
            as ``self.seq_filename`` when it is truthy.
    """
    # Whatever handle_seq_file() returns becomes the working sequence file.
    self.seq_file = get_AB1_file.handle_seq_file(path_to_seq_file, seq_dir)

    # An explicit seq_filename takes precedence. This supports the recursive
    # call in match_with_seq_file(): without it the reverse-complemented
    # sequence itself would end up as the file name, and get_het_calls would
    # then fail trying to open a sequence string instead of a real file.
    self.seq_filename = seq_filename or path_to_seq_file

    self.seq_dir = seq_dir
    self.upstream = upstream
    self.downstream = downstream
    self.ref_base = ref_base
    self.alt_answer = alt_answer
    self.mut_type = mut_type
def parse_file(*args):
    """Parse a tab-separated variant file, running get_seq() for each line.

    Exactly eight positional arguments are expected, in this order:

        input_file: path of the tab-separated file to read. Column 0 is the
            sample name, column 1 the variant position and column 2 the
            mutation (its reference base is the text before the first "/").
        output_file: accepted for interface compatibility; not used here.
        upstream, downstream: forwarded unchanged to CompareSeqs.
        hg_version, genome: forwarded unchanged to get_seq().
        seq_dir: forwarded to the get_AB1_file helpers and to CompareSeqs.
        reference: object providing handle_argument_exception(); it is also
            forwarded to get_seq().

    Returns:
        A list of tab-separated result strings, normally one per non-blank
        input line. A line with no matching sequence file contributes a
        placeholder row whose result columns are "-" and whose last column
        is "0".

    Raises:
        ValueError: if the number of positional arguments is not eight.
    """
    input_file, output_file, upstream, downstream, hg_version, \
        seq_dir, reference, genome = args

    # Read everything up front inside a context manager so the file handle
    # is closed before the (potentially slow) per-line processing starts.
    # Blank lines (e.g. a trailing newline) are skipped instead of failing
    # with an IndexError when their columns are accessed.
    with open(input_file) as handle:
        rows = [raw.rstrip("\n").split("\t") for raw in handle if raw.strip()]

    # every line's result is collected here
    all_scrapped_info = []

    for line in rows:
        seq_name = line[0]
        var_pos = line[1]
        logging.info("\n\n\t We are looking for {} in sample {}\n\n".format(
            var_pos, seq_name))

        # Determine the type of mutation
        mutation = line[2]
        ref_base = mutation.split("/")[0]
        alt_answer, mut_type = determine_mutation(mutation)
        logging.debug(" We are looking for a {}\n".format(mutation))

        # check each individual line of the file for CUSTOM ERRORS; the call
        # is kept for that check, its return value is not needed here
        reference.handle_argument_exception(var_pos)

        # find the files with seq_name in their title
        seq_file = get_AB1_file.get_matching_seq_file(seq_name, seq_dir)

        if not seq_file:
            # Nothing matched: record a placeholder row and move on. There
            # is nothing further to analyse, and continuing here also copes
            # with a lookup that returns None rather than an empty list.
            all_scrapped_info.append("\t".join(
                (seq_name, var_pos, "-", "-", "-", "-", "-", "0")))
            continue

        # produce seq & tab files for each matched file
        for matched in seq_file:
            get_AB1_file.handle_seq_file(matched, seq_dir)

        # initialise a CompareSeqs instance for every found seq_file
        sanger = [CompareSeqs(upstream, downstream, ref_base, alt_answer,
                              mut_type, matched, seq_dir)
                  for matched in seq_file]

        # parse it all into get_seq()
        sequence_info = [get_seq(seq_name, var_pos, mutation, reference,
                                 hg_version, genome, comparer)
                         for comparer in sanger]
        logging.debug("Sequence Info: {}".format(sequence_info))

        # filter out sequences where no seq file was found
        filtered_answers = [info for info in sequence_info
                            if "-" != info[1].split("\t")[4]]
        logging.debug("Filtered Answer: {}".format(filtered_answers))

        # check whether any filtered answer contains the het call of interest
        found_answer = find_variant(filtered_answers, ref_base, alt_answer)

        # Preference order: a confirmed het call, then the first answer that
        # had a usable sequence file, then the first answer of any kind.
        chosen = None
        if found_answer:
            chosen = found_answer
        elif filtered_answers:
            chosen = filtered_answers[0]
        elif sequence_info:
            chosen = sequence_info[0]

        if chosen is not None:
            print_out, answer = chosen
            print(print_out)
            logging.debug("printing: {}".format(print_out))
            all_scrapped_info.append(answer)

    # return all scrapped data
    return all_scrapped_info