예제 #1
0
 def __init__(self, upstream, downstream, ref_base=None, alt_answer=None,
              mut_type=None, path_to_seq_file=None, seq_dir=None,
              seq_filename=None):
     """Record the comparison settings and resolve the sequence file.

     ``path_to_seq_file`` and ``seq_dir`` are handed to
     ``get_AB1_file.handle_seq_file`` and its result is kept as
     ``self.seq_file``; every other argument is stored unchanged on the
     instance under the same name.
     """
     # Resolve the sequence file first, before any attribute is assigned.
     self.seq_file = get_AB1_file.handle_seq_file(path_to_seq_file, seq_dir)

     # An explicitly supplied seq_filename takes priority over the path.
     # Per the original author's note: the recursive call made from
     # match_with_seq_file() would otherwise leave the reverse complemented
     # sequence itself in seq_filename, and get_het_calls then fails when
     # it tries to open that string as if it were a file.
     self.seq_filename = seq_filename or path_to_seq_file

     # Plain pass-through attributes.
     self.upstream, self.downstream = upstream, downstream
     self.ref_base = ref_base
     self.alt_answer = alt_answer
     self.mut_type = mut_type
     self.seq_dir = seq_dir
예제 #2
0
def parse_file(*args):
    """Run every row of a tab-delimited input file through get_seq().

    Each row of ``input_file`` is split on tabs; column 1 is used as the
    sample name, column 2 as the variant position and column 3 as the
    mutation string (the text before its first "/" is taken as the
    reference base). For each row the matching sequence files are looked up
    with ``get_AB1_file.get_matching_seq_file``, one ``CompareSeqs`` object
    is built per match, and each is passed to ``get_seq``.

    Args:
        *args: exactly eight positional values, in this order:
            input_file, output_file, upstream, downstream, hg_version,
            seq_dir, reference, genome. ``output_file`` is accepted for
            interface compatibility but is not used in this function.

    Returns:
        list: the collected per-row results. A row with no matching
        sequence file contributes a tab-joined placeholder string
        (sample name, position, five "-" fields and "0"); any other row
        contributes the ``answer`` half of the selected
        ``(print_out, answer)`` pair.

    Raises:
        ValueError: if the number of positional arguments is not eight.
        IndexError: if a row has fewer than three tab-separated columns.
    """
    (input_file, _output_file, upstream, downstream, hg_version,
     seq_dir, reference, genome) = args

    # Read the whole table up front so the file handle is closed before the
    # per-row processing starts (the original never closed it).
    with open(input_file) as handle:
        rows = [raw.rstrip("\n").split("\t") for raw in handle]

    # append all returned data to this list
    all_scrapped_info = []

    for row in rows:
        seq_name = row[0]
        var_pos = row[1]

        logging.info("\n\n\t We are looking for {} in sample {}\n\n".format( var_pos, seq_name))

        # Determine the type of mutation
        mutation = row[2]
        ref_base = mutation.split("/")[0]
        alt_answer, mut_type = determine_mutation(mutation)

        logging.debug(" We are looking for a {}\n".format(mutation))

        # Check each individual row for custom errors. The return value was
        # never used, so the call is kept purely for its effect
        # (presumably it raises on a bad position -- confirm in `reference`).
        reference.handle_argument_exception(var_pos)

        # find the files with seq_name in their title
        seq_files = get_AB1_file.get_matching_seq_file(seq_name, seq_dir)

        # No match: record the placeholder row and move on. Falling through
        # (as the original did) would iterate over a possibly-None result.
        if not seq_files:
            all_scrapped_info.append("\t".join((seq_name, var_pos, "-", "-", "-", "-", "-", "0")))
            continue

        # Produce seq & tab files for each matched file. Only the side
        # effect matters here; the returned values were never used.
        for matched in seq_files:
            get_AB1_file.handle_seq_file(matched, seq_dir)

        # initialise a CompareSeqs instance for every matched file
        sanger = [CompareSeqs(upstream, downstream, ref_base, alt_answer,
                              mut_type, matched, seq_dir)
                  for matched in seq_files]

        # parse it all into get_seq()
        sequence_info = [get_seq(seq_name, var_pos, mutation, reference,
                                 hg_version, genome, comparer)
                         for comparer in sanger]

        # The original format string had no placeholder, so the value was
        # silently dropped from the log message.
        logging.debug("Sequence Info: {}".format(sequence_info))

        # Keep only results whose fifth tab-separated answer field is not
        # the "-" placeholder (i.e. a sequence file was actually found).
        filtered_answers = [pair for pair in sequence_info
                            if pair[1].split("\t")[4] != "-"]
        logging.debug("Filtered Answer: {}".format(filtered_answers))

        # check whether any filtered answer contains the het call of interest
        found_answer = find_variant(filtered_answers, ref_base, alt_answer)

        # Preference order: a confirmed het call, else the first filtered
        # answer, else the first raw result. Nothing is reported otherwise.
        if found_answer:
            chosen = found_answer
        elif filtered_answers:
            chosen = filtered_answers[0]
        elif sequence_info:
            chosen = sequence_info[0]
        else:
            continue

        print_out, answer = chosen
        print(print_out)
        logging.debug("printing: {}".format(print_out))
        all_scrapped_info.append(answer)

    # return all scrapped data
    return all_scrapped_info