def excise_precursors():
    """Excise potential precursor sequences from the genome.

    Builds and runs one of two external excision commands, depending on
    whether option ``-a`` is set:

    * ``-a`` set: ``excise_precursors.py`` is run with the current
      ``stack_height_min`` and its output is redirected to
      ``<dir_tmp>/precursors.fa``.
    * ``-a`` not set: ``excise_precursors_iterative_final.py`` is run, and
      afterwards the global ``stack_height_min`` is replaced by the first
      line of ``<dir_tmp>/precursors.fa_stack``.

    Calls ``die`` when ``<dir_tmp>/precursors.fa`` is empty or is not a
    regular file.

    Returns:
        0 once the precursor file has been checked.
    """
    global file_genome, parsed_arf, dir_tmp, stack_height_min, max_pres

    banner = "#excising precursors\n"
    pprint(banner)
    print_stderr(banner)
    start()

    fixed_stack_height = options.get('-a')
    if fixed_stack_height:
        command = "excise_precursors.py {} {}/{}_parsed.arf {}/precursors.coords -a {} > {}/precursors.fa\n\n".format(
            file_genome, dir_tmp, parsed_arf, dir_tmp, stack_height_min,
            dir_tmp)
    else:
        command = "excise_precursors_iterative_final.py {} {}/{}_parsed.arf {}/precursors.fa {}/precursors.coords {}\n".format(
            file_genome, dir_tmp, parsed_arf, dir_tmp, dir_tmp, max_pres)

    print_stderr(command)
    # The captured output is not used; read() consumes the command's standard
    # output up to end-of-file.
    os.popen(command).read()

    if not fixed_stack_height:
        # Only the iterative run is followed by reading the stack height back
        # from the "_stack" side file.
        stack_path = '{}/precursors.fa_stack'.format(dir_tmp)
        stack_handle = open_or_die2(stack_path, 'rb')
        stack_height_min = stack_handle.readline().strip()
        stack_handle.close()

    end()

    precursors_path = '{}/precursors.fa'.format(dir_tmp)
    # Perl original: if (-z "$dir_tmp/precursors.fa" or not -f "$dir_tmp/precursors.fa")
    # i.e. the file must be non-empty and a regular plain file.
    if not (file_s(precursors_path) and os.path.isfile(precursors_path)):
        die("No precursors excised\n")

    return 0
def _add_sequence_count(hsh, record_id, sequence):
    """Add the count derived from ``record_id`` to ``hsh`` under the
    normalised form of ``sequence``."""
    cnt = find_cnt(record_id)
    key = tr(sequence, '[acgtun.]', '[ACGTTNN]')
    # try/except instead of create_hash_key_chain(hsh, 0, key): the original
    # code flagged that helper as a performance issue on this path.
    try:
        hsh[key] = hsh[key] + cnt
    except KeyError:
        hsh[key] = cnt


def parse_file_fasta_seqkey(file_fasta, hsh, options):
    """Read a FASTA file and accumulate per-sequence counts into ``hsh``.

    Each record's sequence lines are concatenated, passed through
    ``tr(seq, '[acgtun.]', '[ACGTTNN]')`` and used as the key of ``hsh``; the
    value added is ``find_cnt(<header match>)``.

    Unlike the previous implementation, a blank line no longer ends parsing.
    The old loops treated an empty stripped line the same as end-of-file,
    which silently dropped or ignored records following a blank line.  All
    records, including the last one, now use the same accumulation path.

    Args:
        file_fasta: path handed to ``open_or_die2``.
        hsh: mapping updated in place (sequence -> summed count).
        options: option mapping; when ``options.get('-a') == ''`` progress is
            written via ``print_stderr``.

    Returns:
        None.
    """
    show_progress = options.get('-a') == ''
    if show_progress:
        print_stderr('reading file into hash\n')

    header_re = re.compile(r'^>(\S+)')
    _id = ''
    seq = ''
    in_record = False
    running_1 = 0

    # NOTE(review): the handle is opened with mode 'rb' but matched against
    # str patterns; this relies on what open_or_die2 actually returns -
    # confirm against its definition.
    FASTA = open_or_die2(file_fasta, 'rb')
    try:
        while True:
            raw = FASTA.readline()
            if not raw:
                # Genuine end-of-file: readline() returned nothing at all.
                break
            line = raw.strip()
            m = header_re.match(line)
            if m:
                if in_record:
                    # A new header closes the record collected so far.
                    _add_sequence_count(hsh, _id, seq)
                    running_1 += 1
                    if show_progress:
                        print_stderr('{}\r'.format(running_1))
                # NOTE(review): group() is the whole match including '>',
                # kept as before; the capture group suggests group(1) may
                # have been intended - confirm against find_cnt.
                _id = m.group()
                seq = ''
                in_record = True
            elif in_record:
                # Lines before the first header are ignored, as before.
                seq += line

        # Flush the final record.  As before this also runs when no header
        # was seen at all (with an empty id and an empty sequence).
        _add_sequence_count(hsh, _id, seq)
        running_1 += 1
        if show_progress:
            print_stderr('{}\r'.format(running_1))
    finally:
        FASTA.close()
if re.search(r'\s+', _id): die('Error in line {}: The identifier\n {}\n\ncontains white spaces\n\n{}\n\nYou could run remove_white_space_in_id.py inputfile > newfile\nThis will remove everything from the id line after the first whitespace\n' .format(Nicenumber(counter), _id, hint)) else: create_hash_key_chain(hash_num, 0, _id) hash_num[_id] += 1 elif not re.match(r'^([A|C|G|T|U|N|a|c|g|t|u|n]+)$', rin): die('Error in line {}: The sequence\n{}\n\ncontains characters others than [acgtunACGTUN]\n\n{}' .format(Nicenumber(counter), rin, hint)) if __name__ == '__main__': hash_num = {} _id = None hint = 'Please check your file for the following issues:\n\nI. Sequences are allowed only to comprise characters [ACGTNacgtn].\nII. Identifiers are not allowed to have withespaces.\n' if len(sys.argv) == 1: # from stdin read_handler(sys.stdin) else: # from files for f in sys.argv[1:]: IN = open_or_die2(f, 'rb') read_handler(IN) IN.close() sys.exit(0)
# Option-driven setup for the reading loop that starts at the end of this
# section.
line = []

# Threshold: -50 unless option -s is present, in which case its value is used
# as stored in `options`.
# NOTE(review): presumably a string when taken from the command line - confirm
# that it is converted before numeric comparison.
thres = -50
if options.get('-s') is not None:
    thres = options.get('-s')
score = thres

# Upper limit: 'na' / a very large number unless option -t has a truthy value.
_max = 'na'
maxs = 999999999999999999999999999
if options.get('-t'):
    _max = options.get('-t')
    maxs = _max

# Input file named by option -r.
IN = open_or_die2(options.get('-r'), 'rb')

# Column index 15 by default; 13 when option -m has the empty-string value,
# 14 when option -k has it (-k is tested last, so it wins if both are set).
seqcol = 15
if options.get('-m') == '':
    seqcol = 13
if options.get('-k') == '':
    seqcol = 14

# Labels for indices 0..15; the entries at 13, 14 and 15 are 'mature', 'star'
# and 'pres', the same indices that seqcol can take.
names = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', 'mature', 'star', 'pres')

# Read the input line by line until readline() returns an empty value.
while True:
    l = IN.readline()
    if not l:
        break
def _precheck_fasta_head(path, header_pattern, header_msg, whitespace_msg,
                         sequence_msg):
    """Check the first two lines of a FASTA file and return the second one.

    The first line (stripped) must match ``header_pattern`` and must not
    contain whitespace; the second line may only consist of the characters
    ACGTUN in either case.  On a violation ``printErr()`` is called, followed
    by ``die()`` with the corresponding message.

    Returns:
        The second line exactly as read (not stripped).
    """
    handle = open_or_die2(path, 'rb')
    header = handle.readline().strip()
    if not re.search(header_pattern, header):
        printErr()
        die(header_msg)
    if re.search(r'\s', header):
        printErr()
        die(whitespace_msg)
    first_seq = handle.readline()
    if not re.search(r'^[ACGTUNacgtun]*$', first_seq):
        printErr()
        die(sequence_msg)
    handle.close()
    return first_seq


def _die_on_checker_output(cmd, checked_file):
    """Run ``cmd`` and call ``die`` if it produced any (stripped) output."""
    report = os.popen(cmd).read().strip()
    if report:
        printErr()
        die("problem with {} {}\n".format(checked_file, report))


def test_input_files():
    """Validate all input files and, if possible, pre-quantify known miRNAs.

    Steps, in order:

    1. Quick pre-check of the first header and first sequence line of the
       reads, genome, both mature-miRNA reference files and the precursor
       file.  Files whose name contains ``none`` are skipped.  The first line
       of the mapping file is checked against the arf format, and every
       reference id in the mapping file must be a header of the genome file.
    2. The external ``sanity_check_*.py`` scripts are run; any output from
       one of them is treated as an error.
    3. When both a precursor file and a mature reference file for this
       species are given, ``quantifier.py`` is run and ``options['-q']`` is
       set to the resulting ``miRBase.mrd`` path.

    Errors are reported through ``die`` (mostly preceded by ``printErr()``).

    Changes compared with the previous version:

    * the first reads-file message no longer prints the literal text
      ``$file_reads`` but the actual file name;
    * messages for the other-species reference and the precursor file no
      longer call those files "miRNA reference this species file".

    Returns:
        None.
    """
    global file_reads, file_reads_vs_genome, file_genome, file_precursors, minpreslen, file_mature_ref_other_species, file_mature_ref_this_species

    # ------------------------------------------------------------------
    # Pre-check: first header / first sequence line of every FASTA input.
    # ------------------------------------------------------------------
    reads_bad_chars = 'File {} contains not allowed characters in sequences\nAllowed characters are ACGTUN\nReads file {} is not a fasta file\n\n'.format(
        file_reads, file_reads)
    _precheck_fasta_head(
        file_reads, r'^>\S+',
        "The first line of file {} does not start with '>identifier'\nReads file {} is not a valid fasta file\n\n"
        .format(file_reads, file_reads),
        # A header containing whitespace is reported with the same message
        # as an invalid sequence line (unchanged behaviour).
        reads_bad_chars,
        reads_bad_chars)

    _precheck_fasta_head(
        file_genome, r'>\S+',
        "The first line of file {} does not start with '>identifier'\nGenome file {} is not a fasta file\n\n"
        .format(file_genome, file_genome),
        'Genome file {} has not allowed whitespaces in its first identifier\n\n'
        .format(file_genome),
        'File {} contains not allowed characters in sequences\nAllowed characters are ACGTUN\nGenome file {} is not a fasta file\n\n'
        .format(file_genome, file_genome))

    # Collect every genome header line (including the leading '>').
    tmps = os.popen('grep ">" {}'.format(file_genome)).read().strip()
    genomeids = set(tmps.split("\n"))

    IN = open_or_die2(file_reads_vs_genome, 'rb')
    line = IN.readline()
    if not re.search(
            r'^(\S+_x\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+([+-])\s+(\d+)\s*([mDIM]*)$',
            line):
        printErr()
        die('Mapping file {} is not in arf format\n\nEach line of the mapping file must consist of the following fields\nreadID_wo_whitespaces length start end read_sequence genomicID_wo_whitspaces length start end genomic_sequence strand #mismatches editstring\nThe editstring is optional and must not be contained\nThe readID must end with _xNumber and is not allowed to contain whitespaces.\nThe genomeID is not allowed to contain whitespaces.'
            .format(file_reads_vs_genome))
    IN.close()

    # Get ids from the arf file and compare them with ids from the genome file.
    tmps = os.popen(
        'cut -f6 {}|sort -u'.format(file_reads_vs_genome)).read().strip()
    for s in tmps.split("\n"):
        if ">{}".format(s) not in genomeids:
            die("The mapped reference id {} from file {} is not an id of the genome file {}\n\n"
                .format(s, file_reads_vs_genome, file_genome))

    if not re.search('none', file_mature_ref_this_species):
        _precheck_fasta_head(
            file_mature_ref_this_species, r'>\S+',
            "The first line of file {} does not start with '>identifier'\nmiRNA reference this species file {} is not a fasta file\n\n"
            .format(file_mature_ref_this_species,
                    file_mature_ref_this_species),
            "miRNA reference this species file {} has not allowed whitespaces in its first identifier\n\n"
            .format(file_mature_ref_this_species),
            "File {} contains not allowed characters in sequences\nAllowed characters are ACGTUN\nmiRNA reference this species file {} is not a fasta file\n\n"
            .format(file_mature_ref_this_species,
                    file_mature_ref_this_species))

    if not re.search('none', file_mature_ref_other_species):
        _precheck_fasta_head(
            file_mature_ref_other_species, r'>\S+',
            "The first line of file {} does not start with '>identifier'\nmiRNA reference other species file {} is not a fasta file\n\n"
            .format(file_mature_ref_other_species,
                    file_mature_ref_other_species),
            "miRNA reference other species file {} has not allowed whitespaces in its first identifier\n\n"
            .format(file_mature_ref_other_species),
            "File {} contains not allowed characters in sequences\nAllowed characters are ACGTUN\nmiRNA reference other species file {} is not a fasta file\n\n"
            .format(file_mature_ref_other_species,
                    file_mature_ref_other_species))

    if not re.search('none', file_precursors):
        line = _precheck_fasta_head(
            file_precursors, r'>\S+',
            "The first line of file {} does not start with '>identifier'\nprecursor file {} is not a fasta file\n\n"
            .format(file_precursors, file_precursors),
            "precursor file {} has not allowed whitespaces in its first identifier\n\n"
            .format(file_precursors),
            "File {} contains not allowed characters in sequences\nAllowed characters are ACGTUN\nprecursor file {} is not a fasta file\n\n"
            .format(file_precursors, file_precursors))
        # The length is taken from the line as read, i.e. including any line
        # terminator (unchanged behaviour).
        if len(line) < minpreslen:
            printErr()
            die("The precursor file {} does not contain sequences of at least {} nt\nPlease make sure that you provided the correct file and the correct parameter ordering when calling {}\nIf you have precursors with less than {} please use option -p <int> to specify this length\n"
                .format(file_precursors, minpreslen, sys.argv[0], minpreslen))

    # #################################################
    # precheck finished
    # #################################################

    # Do stringent testing of all input files.
    pprint("#testing input files\n")
    print_stderr("#testing input files\n")

    if not re.search('none', file_mature_ref_this_species):
        start()
        cmd = "sanity_check_mature_ref.py {} 2>&1\n\n".format(
            file_mature_ref_this_species)
        print_stderr(cmd)
        _die_on_checker_output(cmd, file_mature_ref_this_species)
        end()

    if not re.search(r'none', file_mature_ref_other_species):
        start()
        cmd = "sanity_check_mature_ref.py {} 2>&1\n\n".format(
            file_mature_ref_other_species)
        print_stderr(cmd)
        _die_on_checker_output(cmd, file_mature_ref_other_species)
        end()

    # For the reads file the command is echoed before start() is called
    # (order kept from the previous version).
    cmd = "sanity_check_reads_ready_file.py {} 2>&1\n\n".format(file_reads)
    print_stderr(cmd)
    start()
    _die_on_checker_output(cmd, file_reads)
    end()

    start()
    cmd = "sanity_check_genome.py {} 2>&1;\n\n".format(file_genome)
    print_stderr(cmd)
    _die_on_checker_output(cmd, file_genome)
    end()

    start()
    cmd = "sanity_check_mapping_file.py {} 2>&1".format(file_reads_vs_genome)
    print_stderr(cmd)
    _die_on_checker_output(cmd, file_reads_vs_genome)
    end()

    if not re.search('none', file_precursors):
        start()
        cmd = "sanity_check_mature_ref.py {} 2>&1".format(file_precursors)
        print_stderr(cmd)
        _die_on_checker_output(cmd, file_precursors)
        end()

        # Pre-quantitation needs the precursors and the mature reference of
        # this species.
        start()
        if not re.search('none', file_mature_ref_this_species, re.IGNORECASE):
            print_stderr("Quantitation of expressed miRNAs in data\n\n\n")

            species = ''
            if options.get('-t'):
                species = "-t {}".format(options.get('-t'))

            file_star = ''
            if options.get('-s'):
                if file_s(options.get('-s')):
                    file_star = "-s {}".format(options.get('-s'))
                else:
                    print_stderr(
                        "File {} specified by option -s is empty or not found\n"
                        .format(options.get('-s')))
                    options['-s'] = 0

            print("#Quantitation of known miRNAs in data\n")

            dopt = "-d" if options.get('-d') == '' else ""
            Popt = "-P" if options.get('-P') == '' else ""

            quant = "quantifier.py -p {} -m {} -r {} {} {} -y {} -k {} {}".format(
                file_precursors, file_mature_ref_this_species, file_reads,
                file_star, species, ltime, dopt, Popt)
            print_stderr(quant, "\n")
            os.system(quant)
            options['-q'] = "expression_analyses/expression_analyses_{}/miRBase.mrd".format(
                ltime)
            end()
        else:
            print_stderr(
                "Pre-quantitation is skipped caused by missing file with known miRNAs\n\n\n"
            )
    else:
        print_stderr(
            "Pre-quantitation is skipped caused by missing file with known precursor miRNAs\n\n\n"
        )