def filter_seqs(input_paths=INPUT_PATHS, input_format=SEQFILE_FORMAT,
                output_path=OUTPUT_PATH, output_format=SEQFILE_FORMAT,
                min_length=MIN_LENGTH, max_length=MAX_LENGTH,
                counted_char=COUNTED_CHAR, min_count=MIN_COUNT,
                max_count=MAX_COUNT, min_prop=MIN_PROP, max_prop=MAX_PROP):
    """Write the records that pass the length and character filters.

    Records are parsed from every path of ``input_paths`` and written to
    ``output_path`` unless a filter rejects them.

    Args:
        input_paths: Iterable of paths to the sequence files to read.
        input_format: Format name handed to ``Bio.SeqIO.parse``.
        output_path: Path of the file receiving the kept records.
        output_format: Format name handed to ``write_records``.
        min_length: Records shorter than this are dropped.
        max_length: Records longer than this are dropped; ``None`` disables
            the upper bound.
        counted_char: Value passed to ``record.seq.count``; ``None``
            disables the count and proportion filters.
        min_count: Minimum number of occurrences of ``counted_char``.
        max_count: Maximum number of occurrences; ``None`` disables it.
        min_prop: Minimum percentage (0-100) of ``counted_char``.
        max_prop: Maximum percentage (0-100); ``None`` disables it.
    """
    with open_file(output_path, 'w') as output_file:
        for input_path in input_paths:
            with open_file(input_path) as input_file:
                for record in Bio.SeqIO.parse(input_file, input_format):
                    length = len(record)
                    if length < min_length:
                        continue
                    if max_length is not None and length > max_length:
                        continue
                    if counted_char is not None:
                        count = record.seq.count(counted_char)
                        # An empty sequence cannot contain the counted
                        # character: use 0 % instead of dividing by zero.
                        prop = 100.0 * count / length if length else 0.0
                        if count < min_count:
                            continue
                        if max_count is not None and count > max_count:
                            continue
                        if prop < min_prop:
                            continue
                        # Same "None means unbounded" convention as
                        # max_length and max_count.
                        if max_prop is not None and prop > max_prop:
                            continue
                    write_records(record, output_file, output_format)
def _parallelize_unindexed_sep_part(target_function, input_paths_part,
                                    output_paths_part, input_format,
                                    output_format, queue, **kwargs):
    """Run ``target_function`` on each input file, one output file each.

    Input and output paths are paired with ``zip``. For every pair the
    records are parsed with ``Bio.SeqIO.parse`` and handed to
    ``target_function(records, output_file, output_format, **kwargs)``;
    the value it returns is put on ``queue``.

    Args:
        target_function: Callable applied to each parsed record stream.
        input_paths_part: Paths of the sequence files to read.
        output_paths_part: Paths of the files to write, in the same order.
        input_format: Format name handed to ``Bio.SeqIO.parse``.
        output_format: Format name forwarded to ``target_function``.
        queue: Object whose ``put`` method receives each result.
        **kwargs: Extra keyword arguments forwarded to ``target_function``.
    """
    path_pairs = zip(input_paths_part, output_paths_part)
    for source_path, destination_path in path_pairs:
        with open_file(source_path) as source, \
                open_file(destination_path, 'w') as destination:
            parsed = Bio.SeqIO.parse(source, input_format)
            outcome = target_function(
                parsed, destination, output_format, **kwargs)
        # NOTE(review): one result is queued per processed file -- confirm
        # that the consumer of the queue expects exactly that.
        queue.put(outcome)
def count_seqs(input_paths, input_format=SEQFILE_FORMAT,
               count_seqs=COUNT_SEQS, count_bases=COUNT_BASES,
               ignore_case=IGNORE_CASE):
    """Count records and/or individual characters in sequence files.

    Args:
        input_paths: Iterable of paths to the sequence files to read.
        input_format: Format name handed to ``Bio.SeqIO.parse``.
        count_seqs: When true, count the records of each file.
        count_bases: When true, count each character of each sequence.
        ignore_case: When true, sequences are upper-cased before their
            characters are counted.

    Returns:
        A ``(seq_counts, base_counts)`` tuple. ``seq_counts`` maps each
        input path to its record count (0 when ``count_seqs`` is false);
        ``base_counts`` maps each character to a ``{input_path: count}``
        dictionary.
    """
    seq_counts = {}
    base_counts = {}
    for input_path in input_paths:
        nb_records = 0
        with open_file(input_path) as handle:
            for record in Bio.SeqIO.parse(handle, input_format):
                if count_seqs:
                    nb_records += 1
                if not count_bases:
                    continue
                residues = record.seq.upper() if ignore_case else record.seq
                for residue in residues:
                    per_file = base_counts.setdefault(residue, {})
                    per_file[input_path] = per_file.get(input_path, 0) + 1
        seq_counts[input_path] = nb_records
    return seq_counts, base_counts
def export_tree(tree_path, terms, types, levels, obsolete, alt, main_ids,
                children, parents, ancestors):
    """Write one tab-separated row per GO identifier found in ``terms``.

    Identifiers are written in sorted order after a header row. A missing
    level is written as ``'NA'``, a missing main identifier as an empty
    string, and missing children/parents/ancestors as empty strings;
    present relations are written sorted and joined with ``', '``.

    Args:
        tree_path: Path of the table to write.
        terms: Mapping of GO identifier to term name.
        types: Mapping of GO identifier to type.
        levels: Mapping of GO identifier to level (entries may be absent).
        obsolete: Mapping of GO identifier to obsolete flag.
        alt: Mapping of GO identifier to alternative-identifier flag.
        main_ids: Mapping of GO identifier to main identifier (entries may
            be absent).
        children: Mapping of GO identifier to its children.
        parents: Mapping of GO identifier to its parents.
        ancestors: Mapping of GO identifier to its ancestors.
    """
    def lookup(mapping, key, fallback):
        # Item access (not .get) so that only a KeyError triggers the
        # fallback, whatever mapping type the caller supplies.
        try:
            return mapping[key]
        except KeyError:
            return fallback

    with open_file(tree_path, 'w') as tree_file:
        writer = csv.writer(tree_file, dialect='excel-tab')
        writer.writerow([
            "#GO_ID", "term", "type", "level", "is_obsolete",
            "is_alternative", "main_id", "children", "parents", "ancestors"
        ])
        for go_id in sorted(terms):
            level = lookup(levels, go_id, 'NA')
            main_id = lookup(main_ids, go_id, '')
            fields = [
                go_id,
                terms[go_id],
                types[go_id],
                level,
                str(obsolete[go_id]),
                str(alt[go_id]),
                main_id,
            ]
            for relation in (children, parents, ancestors):
                try:
                    related = relation[go_id]
                except KeyError:
                    fields.append('')
                else:
                    fields.append(', '.join(sorted(related)))
            writer.writerow(fields)
def export_table_with_seqids_expanded(results, seqids, info, info_header,
                                      output_path, export_level_one_seqids):
    """Write the result table with one row per associated sequence ID.

    For each result row, the first sequence ID is appended to a copy of
    the row; every further sequence ID gets its own row in which the
    result columns are left empty. Rows below level 2 (unless
    ``export_level_one_seqids`` is true) and rows without any sequence ID
    are written unchanged.

    Args:
        results: Mapping of level to a mapping of key to result row; a
            row is a sequence whose first item is the GO identifier.
        seqids: Mapping of GO identifier to a collection of sequence IDs.
            It is not modified.
        info: Optional mapping of sequence ID to a list of extra columns,
            or ``None``.
        info_header: Column titles for ``info``; only used when ``info``
            is not ``None``.
        output_path: Path of the table to write.
        export_level_one_seqids: When false, sequence IDs are not listed
            for rows whose level is below 2.
    """
    with open_file(output_path, 'w') as output_file:
        table = csv.writer(output_file, dialect='excel-tab')
        header = [
            'GO ID', 'Term', 'Level', 'Ref. count', 'Ref. perc.',
            'Sample count', 'Sample perc.', 'FC', 'p-value', 'Reg.',
            'Sequence ID'
        ]
        if info is not None:
            header += info_header
        table.writerow(header)
        for level in sorted(results):
            for key in sorted(results[level]):
                row = results[level][key]
                go_id = row[0]
                if level < 2 and not export_level_one_seqids:
                    table.writerow(row)
                    continue
                lcl_seqids = seqids.get(go_id)
                # Missing AND empty collections both keep the bare row, so
                # a term is never silently dropped from the table.
                if not lcl_seqids:
                    table.writerow(row)
                    continue
                # sorted() leaves the caller's collection untouched and
                # also accepts sets.
                for i, seqid in enumerate(sorted(lcl_seqids)):
                    if i == 0:
                        final_row = list(row)
                    else:
                        final_row = [''] * len(row)
                    final_row.append(seqid)
                    if info is not None:
                        gene_info = info.get(seqid)
                        if gene_info is not None:
                            final_row += gene_info
                    table.writerow(final_row)
def export_table(output_path, seqids, terms, bp, mf, cc, ec, add_ancestors,
                 bp_anc, mf_anc, cc_anc):
    """Write one tab-separated annotation row per sequence identifier.

    Args:
        output_path: Path of the table to write.
        seqids: Sequence identifiers to export, in output order. When
            ``None``, the sorted union of the keys of ``bp``, ``mf`` and
            ``cc`` is used.
        terms: Forwarded to ``format_terms``.
        bp: Mapping keyed by sequence ID (biological process column).
        mf: Mapping keyed by sequence ID (molecular function column).
        cc: Mapping keyed by sequence ID (cellular component column).
        ec: Forwarded to ``format_ec_codes`` (enzyme codes column).
        add_ancestors: When true, three "with ancestors" columns are added
            from ``bp_anc``, ``mf_anc`` and ``cc_anc``.
        bp_anc: Source of the biological process (with ancestors) column.
        mf_anc: Source of the molecular function (with ancestors) column.
        cc_anc: Source of the cellular component (with ancestors) column.
    """
    if seqids is None:
        seqids = sorted(set(bp) | set(mf) | set(cc))

    column_names = [
        'SeqID',
        'Biological process',
        'Molecular function',
        'Cellular component',
    ]
    annotation_sources = [bp, mf, cc]
    if add_ancestors:
        column_names.extend([
            'Biological process (with ancestors)',
            'Molecular function (with ancestors)',
            'Cellular component (with ancestors)',
        ])
        annotation_sources.extend([bp_anc, mf_anc, cc_anc])
    column_names.append('Enzyme codes')

    with open_file(output_path, 'w') as output_file:
        writer = csv.writer(output_file, dialect='excel-tab')
        writer.writerow(column_names)
        for seqid in seqids:
            fields = [seqid]
            for source in annotation_sources:
                fields.append(format_terms(seqid, source, terms))
            fields.append(format_ec_codes(seqid, ec))
            writer.writerow(fields)
def generate_random_sequences_part(
        output_path, output_format, seq_type, min_length, max_length,
        nb_sequences, prefix, start_index, index_width):
    """Generate ``nb_sequences`` random records and write them to a file.

    Args:
        output_path: Path of the sequence file to write.
        output_format: Format name handed to ``write_records``.
        seq_type: One of ``'dna'``, ``'rna'``, ``'aa'``, ``'prot'``,
            ``'prot_stop'`` or ``'cds'``.
        min_length: Forwarded to ``define_length``.
        max_length: Forwarded to ``define_length``.
        nb_sequences: Number of records to generate.
        prefix: Forwarded to ``make_seqid``.
        start_index: Offset added to the loop index to number the records.
        index_width: Forwarded to ``make_seqid``.

    Raises:
        ValueError: If ``seq_type`` is not one of the supported values and
            at least one sequence has to be generated.
    """
    with open_file(output_path, 'w') as output_file:
        for i in range(nb_sequences):
            seqid = make_seqid(prefix, i + start_index, index_width)
            length = define_length(min_length, max_length)
            if seq_type == 'dna':
                seq = generate_random_seq(length, DNA_NTS)
                alphabet = DNA_ALPHA()
            elif seq_type == 'rna':
                seq = generate_random_seq(length, RNA_NTS)
                alphabet = RNA_ALPHA()
            elif seq_type == 'aa':
                seq = generate_random_seq(length, AMINOACIDS)
                alphabet = PROT_ALPHA()
            elif seq_type == 'prot':
                seq = generate_random_seq(length-1, AMINOACIDS,
                                          start=START_AA)
                alphabet = PROT_ALPHA()
            elif seq_type == 'prot_stop':
                seq = generate_random_seq(length-1, AMINOACIDS,
                                          start=START_AA, end=STOP_AA)
                alphabet = PROT_ALPHA()
            elif seq_type == 'cds':
                seq = generate_random_seq(length//3-2, CODONS,
                                          start=START_CODONS,
                                          end=STOP_CODONS)
                alphabet = DNA_ALPHA()
            else:
                # Without this branch `seq` and `alphabet` would be
                # unbound and the failure would be an obscure NameError.
                raise ValueError(
                    'Unknown sequence type: {!r}.'.format(seq_type))
            record = Bio.SeqRecord.SeqRecord(
                id=seqid, seq=Bio.Seq.Seq(seq, alphabet),
                description=DESCRIPTION)
            write_records(record, output_file, output_format)
def merge_outputs(tmp_output_paths, output_path):
    """Concatenate temporary files into ``output_path``, then delete them.

    Args:
        tmp_output_paths: Paths of the temporary files, in output order.
            The collection is iterated twice (copy, then removal).
        output_path: Path of the merged file to write.
    """
    with open_file(output_path, 'w') as merged_file:
        for part_path in tmp_output_paths:
            with open(part_path) as part_file:
                for part_line in part_file:
                    merged_file.write(part_line)

    # Temporary files are removed only once every part has been copied.
    for part_path in tmp_output_paths:
        os.remove(part_path)
def import_seqids(seqids_path):
    """Read one sequence identifier per line.

    Args:
        seqids_path: Path of the file to read, or ``None``.

    Returns:
        The list of lines with surrounding whitespace stripped (blank
        lines are kept as empty strings), or ``None`` when ``seqids_path``
        is ``None``.
    """
    if seqids_path is None:
        return None
    with open_file(seqids_path) as seqids_file:
        return [raw_line.strip() for raw_line in seqids_file]
def import_seqids(seqids_path):
    """Collect the first column of a tab-separated file.

    Args:
        seqids_path: Path of the table to read.

    Returns:
        The set of first-column values; empty rows are ignored.
    """
    with open_file(seqids_path) as seqids_file:
        rows = csv.reader(seqids_file, dialect='excel-tab')
        return {fields[0] for fields in rows if fields}
def import_annotations(annot_paths, types, levels, obsolete, main_ids,
                       add_ancestors, ancestors, min_level=MIN_LEVEL,
                       max_level=MAX_LEVEL):
    """Read tab-separated annotation files into per-type dictionaries.

    The first two columns of each row are the sequence ID and the
    annotation ID. ``EC:`` annotations are stored with
    ``updict_add_to_set``; ``GO:`` annotations are filtered (unknown,
    obsolete, or outside the level bounds are skipped), replaced by their
    entry in ``main_ids`` when there is one, and stored with
    ``add_annotation`` according to their type. Any other annotation is
    reported on stderr and skipped.

    Args:
        annot_paths: Iterable of paths to the annotation tables.
        types: Mapping of GO identifier to type.
        levels: Mapping of GO identifier to level.
        obsolete: Mapping of GO identifier to obsolete flag.
        main_ids: Mapping of GO identifier to main identifier.
        add_ancestors: When true, the ``*_anc`` dictionaries are created;
            otherwise they are ``None``.
        ancestors: Forwarded to ``add_annotation``.
        min_level: Annotations with a lower level are skipped.
        max_level: Annotations with a higher level are skipped; ``None``
            disables the upper bound.

    Returns:
        The tuple ``(bp, mf, cc, ec, bp_anc, mf_anc, cc_anc)``.
    """
    bp, mf, cc, ec = {}, {}, {}, {}
    if add_ancestors:
        bp_anc, mf_anc, cc_anc = {}, {}, {}
    else:
        bp_anc, mf_anc, cc_anc = None, None, None

    # GO type -> (direct annotations, annotations with ancestors).
    destinations = {
        'biological_process': (bp, bp_anc),
        'molecular_function': (mf, mf_anc),
        'cellular_component': (cc, cc_anc),
    }

    for annot_path in annot_paths:
        with open_file(annot_path) as annot_file:
            for fields in csv.reader(annot_file, dialect='excel-tab'):
                seqid = fields[0]
                go_id = fields[1]
                is_enzyme_code = go_id.startswith('EC:')
                if not (is_enzyme_code or go_id.startswith('GO:')):
                    print_stderr(
                        'ERROR: Unknown annotation type: {}.'.format(go_id))
                    continue
                if is_enzyme_code:
                    updict_add_to_set(ec, seqid, go_id)
                    continue

                try:
                    go_type = types[go_id]
                except KeyError:
                    print_stderr(
                        'ERROR: Annotation not found: {}.'.format(go_id))
                    continue

                if obsolete[go_id]:
                    continue
                level = levels[go_id]
                if level < min_level:
                    continue
                if max_level is not None and level > max_level:
                    continue

                # Switch to the main identifier when one is registered.
                try:
                    go_id = main_ids[go_id]
                except KeyError:
                    pass

                destination = destinations.get(go_type)
                if destination is not None:
                    direct, with_ancestors = destination
                    add_annotation(direct, seqid, go_id, with_ancestors,
                                   ancestors)
    return bp, mf, cc, ec, bp_anc, mf_anc, cc_anc
def export_table(results, output_path):
    """Write the result rows as a tab-separated table.

    Args:
        results: Mapping of level to a mapping of key to result row. Rows
            are written by sorted level, then by sorted key.
        output_path: Path of the table to write.
    """
    column_names = [
        'GO ID', 'Term', 'Level', 'Ref. count', 'Ref. perc.',
        'Sample count', 'Sample perc.', 'FC', 'p-value', 'Reg.'
    ]
    with open_file(output_path, 'w') as output_file:
        writer = csv.writer(output_file, dialect='excel-tab')
        writer.writerow(column_names)
        for level in sorted(results):
            rows_by_key = results[level]
            writer.writerows(rows_by_key[key] for key in sorted(rows_by_key))
def import_values(values_path, integers=False):
    """Read one value per line, ignoring blank lines.

    Args:
        values_path: Path of the file to read.
        integers: When true, each value is converted with ``int``.

    Returns:
        The list of stripped values, as strings or as integers.
    """
    with open_file(values_path) as input_file:
        stripped_lines = (raw_line.strip() for raw_line in input_file)
        return [
            int(text) if integers else text
            for text in stripped_lines
            if text
        ]
def extract_records_complex(seqids, descriptions, names, mol_types, taxids,
                            comparison_func, inverse=INVERSE,
                            input_paths=INPUT_PATHS,
                            input_format=SEQFILE_FORMAT,
                            output_path=OUTPUT_PATH,
                            output_format=SEQFILE_FORMAT):
    """Write the records selected by ``good_record`` to ``output_path``.

    Args:
        seqids: Forwarded to ``good_record``.
        descriptions: Forwarded to ``good_record``.
        names: Forwarded to ``good_record``.
        mol_types: Forwarded to ``good_record``.
        taxids: Forwarded to ``good_record``.
        comparison_func: Forwarded to ``good_record``.
        inverse: When true, the selection is inverted: records rejected by
            ``good_record`` are written and accepted ones are skipped.
        input_paths: Iterable of paths to the sequence files to read.
        input_format: Format name handed to ``Bio.SeqIO.parse``.
        output_path: Path of the file receiving the kept records.
        output_format: Format name handed to ``write_records``.
    """
    with open_file(output_path, 'w') as output_file:
        for input_path in input_paths:
            with open_file(input_path) as input_file:
                records = Bio.SeqIO.parse(input_file, input_format)
                for record in records:
                    keep_record = good_record(
                        record, seqids, descriptions, names, mol_types,
                        taxids, comparison_func)
                    if inverse:
                        keep_record = not keep_record
                    if keep_record:
                        write_records(record, output_file, output_format)
def import_info(info_path):
    """Read a tab-separated table of extra information per sequence ID.

    The first row is the header; in every row the first column is the
    sequence ID and the remaining columns are its information.

    Args:
        info_path: Path of the table to read, or ``None``.

    Returns:
        A ``(info, info_header)`` tuple: ``info`` maps each sequence ID to
        the list of its remaining columns and ``info_header`` is the
        header row without its first column. ``(None, None)`` is returned
        when ``info_path`` is ``None``, and ``({}, [])`` when the file has
        no header row.
    """
    if info_path is None:
        return None, None
    info = {}
    with open_file(info_path) as info_file:
        info_table = csv.reader(info_file, dialect='excel-tab')
        # A default avoids leaking StopIteration on an empty file.
        header = next(info_table, None)
        if header is None:
            return info, []
        info_header = header[1:]
        for row in info_table:
            # csv.reader yields [] for blank lines; they carry no seqid.
            if not row:
                continue
            info[row[0]] = row[1:]
    return info, info_header
def import_obo(obo_path):
    """Parse an OBO file into per-term dictionaries.

    Reading stops at the first ``[Typedef]`` line. Each ``id: `` line
    starts a new current term to which the following ``name: ``,
    ``namespace: ``, ``is_obsolete: true`` and ``alt_id: `` lines apply.
    Any other line is matched against ``PARENT_PREFIXES`` and
    ``CHILD_PREFIXES`` to record parent/child relations.

    Args:
        obo_path: Path of the OBO file to read.

    Returns:
        The tuple ``(terms, types, alt, alt_ids, obsolete, parents,
        children)`` of dictionaries keyed by GO identifier.
    """
    def before_comment(text):
        # Drop everything from the first '!' onwards, then trim whitespace.
        return text.partition('!')[0].strip()

    terms, types, alt, alt_ids, obsolete = {}, {}, {}, {}, {}
    parents, children = {}, {}
    with open_file(obo_path, 'r') as obo_file:
        for raw_line in obo_file:
            line = raw_line.strip()
            if line == '[Typedef]':
                break
            if line.startswith('id: '):
                current_id = line[4:]
                obsolete[current_id] = False
                alt[current_id] = False
                alt_ids[current_id] = []
            elif line.startswith('name: '):
                terms[current_id] = line[6:]
            elif line.startswith('namespace: '):
                types[current_id] = line[11:]
            elif line == 'is_obsolete: true':
                obsolete[current_id] = True
            elif line.startswith('alt_id: '):
                alternative = before_comment(line[8:])
                alt[alternative] = True
                updict_append_to_list(alt_ids, current_id, alternative)
            else:
                for prefix in PARENT_PREFIXES:
                    if line.startswith(prefix):
                        # The last two characters of the prefix belong to
                        # the identifier, hence the "- 2".
                        parent = before_comment(line[len(prefix) - 2:])
                        updict_append_to_list(parents, current_id, parent)
                        updict_append_to_list(children, parent, current_id)
                for prefix in CHILD_PREFIXES:
                    if line.startswith(prefix):
                        child = before_comment(line[len(prefix) - 2:])
                        updict_append_to_list(parents, child, current_id)
                        updict_append_to_list(children, current_id, child)
    return terms, types, alt, alt_ids, obsolete, parents, children
def _parallelize_indexed_sep_part(target_function, input_paths_part,
                                  index_paths_part, output_paths_part,
                                  input_format, output_format, queue,
                                  **kwargs):
    """Run ``target_function`` on indexed records, one output file each.

    Input, index and output paths are combined with ``zip``. Records are
    obtained with ``Bio.SeqIO.index`` when the index path is ``None`` and
    with ``Bio.SeqIO.index_db`` otherwise, then handed to
    ``target_function(records, output_file, output_format, **kwargs)``;
    the value it returns is put on ``queue``.

    Args:
        target_function: Callable applied to each indexed record set.
        input_paths_part: Paths of the sequence files to read.
        index_paths_part: Index paths (or ``None``), in the same order.
        output_paths_part: Paths of the files to write, in the same order.
        input_format: Format name handed to Biopython.
        output_format: Format name forwarded to ``target_function``.
        queue: Object whose ``put`` method receives each result.
        **kwargs: Extra keyword arguments forwarded to ``target_function``.
    """
    path_triples = zip(input_paths_part, index_paths_part, output_paths_part)
    for source_path, db_path, destination_path in path_triples:
        with open_file(destination_path, 'w') as destination:
            if index_path_is_missing := (db_path is None):
                indexed = Bio.SeqIO.index(source_path, input_format)
            else:
                indexed = Bio.SeqIO.index_db(db_path, source_path,
                                             input_format)
            outcome = target_function(
                indexed, destination, output_format, **kwargs)
        # NOTE(review): one result is queued per processed file -- confirm
        # that the consumer of the queue expects exactly that.
        queue.put(outcome)
def import_tree(tree_path):
    """Read a tab-separated GO tree table into dictionaries.

    Columns are read by position: identifier, term, type, level,
    obsolete flag, alternative flag, main identifier; the last column
    holds the ancestors separated by ``', '``.

    Args:
        tree_path: Path of the table to read.

    Returns:
        The tuple ``(terms, types, levels, obsolete, main_ids,
        ancestors)`` of dictionaries keyed by the first column. A level
        that is not an integer is stored as ``None``; ``main_ids`` only
        holds the rows whose alternative flag is ``'True'``; ``ancestors``
        values are sets (empty when the column is empty).
    """
    terms, types, levels = {}, {}, {}
    obsolete, main_ids, ancestors = {}, {}, {}
    with open_file(tree_path) as tree_file:
        for row in csv.reader(tree_file, dialect='excel-tab'):
            # csv.reader yields [] for blank lines; nothing to import.
            if not row:
                continue
            # NOTE(review): a header row (e.g. starting with "#GO_ID") is
            # imported like any other row -- confirm this is intended.
            go_id = row[0]
            terms[go_id] = row[1]
            types[go_id] = row[2]
            try:
                levels[go_id] = int(row[3])
            except ValueError:
                levels[go_id] = None
            obsolete[go_id] = row[4] == 'True'
            if row[5] == 'True':
                main_ids[go_id] = row[6]
            # ''.split(', ') returns [''], which would register a bogus
            # empty-string ancestor; keep only non-empty identifiers.
            ancestors[go_id] = {
                ancestor for ancestor in row[-1].split(', ') if ancestor
            }
    return terms, types, levels, obsolete, main_ids, ancestors
def export_table_with_seqids(results, seqids, output_path,
                             export_level_one_seqids):
    """Write the result table with a joined "Sequence IDs" column.

    Args:
        results: Mapping of level to a mapping of key to result row; a
            row is a sequence whose first item is the GO identifier.
        seqids: Mapping of GO identifier to a collection of sequence IDs.
            A GO identifier without entry gets an empty column.
        output_path: Path of the table to write.
        export_level_one_seqids: When false, the sequence ID column is
            only added to rows whose level is above 1.
    """
    with open_file(output_path, 'w') as output_file:
        table = csv.writer(output_file, dialect='excel-tab')
        header = [
            'GO ID', 'Term', 'Level', 'Ref. count', 'Ref. perc.',
            'Sample count', 'Sample perc.', 'FC', 'p-value', 'Reg.',
            'Sequence IDs'
        ]
        table.writerow(header)
        for level in sorted(results):
            for key in sorted(results[level]):
                # Copy so the caller's row is not extended in place.
                row = list(results[level][key])
                go_id = row[0]
                if level > 1 or export_level_one_seqids:
                    # .get() so a term without any sequence yields an
                    # empty column instead of a KeyError.
                    row.append(', '.join(sorted(seqids.get(go_id, ()))))
                table.writerow(row)
def extract_records_by_seqid(seqids, input_paths=INPUT_PATHS,
                             input_format=SEQFILE_FORMAT,
                             index_path=INDEX_PATH,
                             output_path=OUTPUT_PATH,
                             output_format=SEQFILE_FORMAT):
    """Write the records matching ``seqids``, in the order requested.

    Identifiers for which ``get_indexed_record`` raises ``KeyError`` are
    collected and reported in a single message on stderr.

    Args:
        seqids: Iterable of sequence identifiers to extract.
        input_paths: Forwarded to ``make_indexed_records_list``.
        input_format: Forwarded to ``make_indexed_records_list``.
        index_path: Forwarded to ``make_indexed_records_list``.
        output_path: Path of the file receiving the extracted records.
        output_format: Format name handed to ``write_records``.
    """
    indexed_records = make_indexed_records_list(
        input_paths, index_path, input_format)
    missing = []
    with open_file(output_path, 'w') as output_file:
        for wanted_id in seqids:
            try:
                found = get_indexed_record(wanted_id, indexed_records)
            except KeyError:
                missing.append(wanted_id)
                continue
            write_records(found, output_file, output_format)
    if missing:
        message = '{:d} identifier(s) not found: {}'.format(
            len(missing), ', '.join(missing))
        print_stderr(message)
def export_tex(results, output_path, name, go_type_long, header=TEX_HEADER,
               footer=TEX_FOOTER):
    """Write the results as a TeX table.

    Underscores in ``name`` and ``go_type_long`` are escaped as ``\\_``.
    The four pieces of ``header`` are interleaved with ``go_type_long``,
    ``name`` and ``name`` again. Each level then starts with a
    ``\\midrule`` line followed by its rows, formatted by
    ``format_tex_row`` in sorted key order, and ``footer`` closes the
    file.

    Args:
        results: Mapping of level to a mapping of key to result row.
        output_path: Path of the TeX file to write.
        name: Name inserted twice in the header.
        go_type_long: GO type label inserted in the header.
        header: Indexable holding at least four header fragments.
        footer: Text written after the last row.
    """
    escaped_name = name.replace('_', '\\_')
    escaped_type = go_type_long.replace('_', '\\_')
    with open_file(output_path, 'w') as tex_file:
        preamble_parts = (
            header[0], escaped_type,
            header[1], escaped_name,
            header[2], escaped_name,
            header[3],
        )
        tex_file.write(''.join(preamble_parts))
        for level in sorted(results):
            tex_file.write('\\midrule\n')
            level_rows = results[level]
            for key in sorted(level_rows):
                tex_file.write(format_tex_row(level_rows[key]) + '\n')
        tex_file.write(footer)