from multiprocessing import Process
from time import time

from Bio.SeqIO import parse, to_dict

# cmd_parse, rfd, disassembler, print_verbose and MAX_SEED are project-local.


def main():
    start_time = time()
    parser = cmd_parse()  # Parse command-line arguments
    with open(parser['file']) as genome:
        fasta_genome = to_dict(parse(genome, 'fasta'))  # Read the genome file
    jobs = parser['jobs']  # Number of processes to parallelize over
    fragments = parser['fragments_num']  # Total number of fragments to generate
    # Split the fragments evenly across processes; the last process
    # takes whatever remainder is left over.
    frags_per_core = [fragments // jobs] * (jobs - 1)
    frags_per_core.append(fragments - sum(frags_per_core))
    dis_file = parser['dis_file']  # Path to the empirical distribution file
    emp_dis = rfd(dis_file) if dis_file is not None else None  # Empirical distribution, if provided
    my_seed = parser['seed']  # NumPy seeding argument
    processes = []  # Worker processes
    for job, fragments_num in enumerate(frags_per_core):
        # Per-process seed: offset the base seed by the job index,
        # pass -1 through unchanged, and use None when no seed was given.
        seeding = ((my_seed + job) % MAX_SEED if my_seed != -1 else my_seed) if my_seed is not None else None
        processes.append(Process(target=disassembler,
                                 args=(fasta_genome, parser['seq_type'], fragments_num,
                                       parser['out_file'], parser['depth'], parser['read_len'],
                                       job, seeding, emp_dis, parser['mean_len'])))
        processes[-1].start()
    for process in processes:
        process.join()
    print_verbose(
        'The program completed disassembling without any errors. Elapsed time={:f}'.format(time() - start_time),
        parser['session_id'], parser['logfile'], parser['verbose'],
        parser['params'])  # Parameter logging
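# The even-split-plus-remainder scheme above guarantees the per-process
# fragment counts always sum to the requested total. A minimal standalone
# check of that arithmetic (the values are hypothetical):
jobs, fragments = 3, 10
frags_per_core = [fragments // jobs] * (jobs - 1)
frags_per_core.append(fragments - sum(frags_per_core))
assert frags_per_core == [3, 3, 4] and sum(frags_per_core) == fragments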
from io import StringIO

from Bio.SeqIO import parse, to_dict

# run_hmmsearch is project-local.


def domain_filter(in_file, iromppath, lowqual, threads):
    # First pass: keep proteins matching the plug domain (PF07715).
    q = run_hmmsearch(in_file, iromppath + '/PF07715.hmm', threads)
    if not q:
        return []
    in_file_dict = to_dict(parse(StringIO(in_file), 'fasta'))
    filter1 = ''
    for h in q.hit_keys:
        filter1 += in_file_dict[h].format('fasta')
    print(f'Filtered {filter1.count(">")} proteins with {q.description}')
    if lowqual is True or len(filter1) == 0:
        return filter1
    # Second pass: also require the TonB-dependent receptor domain (PF00593).
    q = run_hmmsearch(filter1, iromppath + '/PF00593.hmm', threads)
    filter1_dict = to_dict(parse(StringIO(filter1), 'fasta'))
    filter2 = ''
    for h in q.hit_keys:
        filter2 += filter1_dict[h].format('fasta')
    print(f'Filtered {filter2.count(">")} proteins with {q.description}')
    return filter2
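# domain_filter only relies on run_hmmsearch (project-local) returning an
# object with hit_keys; the FASTA round-tripping can be exercised on its
# own. A minimal sketch of the hit-key filtering step, with hypothetical
# sequences and hits:
from io import StringIO
from Bio.SeqIO import parse, to_dict

fasta = '>prot_1\nMKTAYIAK\n>prot_2\nMLAVITGA\n'
hit_keys = ['prot_2']  # stand-in for hmmsearch hits
records = to_dict(parse(StringIO(fasta), 'fasta'))
kept = ''.join(records[h].format('fasta') for h in hit_keys)
print(kept)  # only prot_2 survives the filter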
from io import StringIO

from Bio.SeqIO import parse, to_dict, write


def export_proteins(in_file, out_file, name, hits):
    to_write = []
    in_file_dict = to_dict(parse(StringIO(in_file), 'fasta'))
    for row in hits.itertuples():
        # Relabel each record with its annotation before writing.
        r = in_file_dict[row.query]
        r.id = row.query
        r.description = f'{row.description} {row.hit}'
        to_write.append(r)
    if out_file == '':
        if name == '-':
            out_file = 'stdin_sideroscanner.faa'
        else:
            out_file = f'{name}_sideroscanner.faa'
    write(to_write, out_file, 'fasta')
    print(f'[-] Proteins written to: {out_file}')
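# A minimal usage sketch (the column values are hypothetical); with
# out_file='' and a regular input name, the output falls back to
# '<name>_sideroscanner.faa':
import pandas as pd

fasta = '>prot_1\nMKTAYIAK\n'
hits = pd.DataFrame({'query': ['prot_1'],
                     'description': ['TonB-dependent receptor'],
                     'hit': ['PF00593']})
export_proteins(fasta, '', 'sample', hits)  # writes sample_sideroscanner.faa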
from io import StringIO

import pandas as pd
from Bio.Seq import Seq
from Bio.SeqIO import parse, to_dict

# run_mast is project-local.


def tfbs_screen(in_file, furpath, hits, length):
    prom_dict = to_dict(parse(StringIO(in_file), 'fasta'))
    queries = ''
    for row in hits.itertuples():
        # Take `length` bp immediately upstream of the gene, strand-aware.
        if row.str == '1':
            f_start = int(row.start) - length
            f_end = int(row.start)
        elif row.str == '-1':
            f_start = int(row.end)
            f_end = int(row.end) + length
        else:
            continue
        try:
            x = prom_dict[row.contig][f_start:f_end]
            x.id = f'{row.query}:{f_start}'
            queries += x.format('fasta')
        except KeyError:
            continue
    mast_out = run_mast(queries, furpath + '/fur.meme').rstrip()
    results = [line for line in mast_out.split('\n') if not line.startswith('#')]
    if len(results) == 0:
        print('No binding sites found')
        return hits
    print(f'Putative TFBS for {len(results)} hit(s)')
    mast = []
    for q in results:
        fields = q.split(' ')
        query = fields[0].split(':')[0]
        offset = int(fields[0].split(':')[1])
        fur_start = offset + int(fields[4])
        fur_end = offset + int(fields[5])
        pval = fields[8]
        bs = str(prom_dict[query.rsplit('_', 1)[0]][fur_start:fur_end].seq)
        if fields[1] == '-1':
            bs = str(Seq(bs).reverse_complement())
        mast.append(f'{query}#{fur_start}#{fur_end}#{pval}#{bs}')
    df = pd.DataFrame([sub.split('#') for sub in mast],
                      columns=['query', 'fur_start', 'fur_end', 'p_value', 'fur_box'])
    return hits.merge(df, on='query', how='left')
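# The promoter window above is strand-aware: on the '+' strand it takes
# `length` bp ending at the gene start, on the '-' strand `length` bp
# starting at the gene end. A quick check with hypothetical coordinates:
length = 50
plus = {'strand': '1', 'start': 200, 'end': 800}
minus = {'strand': '-1', 'start': 200, 'end': 800}
print(plus['start'] - length, plus['start'])  # 150 200 -> upstream on '+'
print(minus['end'], minus['end'] + length)    # 800 850 -> upstream on '-'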
import io
import re

from Bio.SeqIO import parse, to_dict, write
from Bio.SeqRecord import SeqRecord


def matchExonerateCdhit(fasta, flanks, taxon, output):
    """
    my %opts = {
        fasta  => "*cdhit.exonerate",
        flanks => "*.cdhit",
        taxon  => $core
    }
    """
    # Read in sequences, dropping exonerate's log lines.
    with open(fasta) as handle:
        lines = [l for l in handle if not re.findall('(Command|Hostname|exonerate)', l)]
    records = list(parse(io.StringIO(''.join(lines)), 'fasta'))
    # Keep only the sequences in the correct orientation
    # (known from the reference sequence used with exonerate).
    oriented = []
    for record in records:
        coordinates = record.description.split('\t')[1]
        start = int(coordinates.split('-')[0])
        end = int(coordinates.split('-')[1])
        if end > start:
            oriented.append(record)
    WSdict = to_dict(parse(flanks, 'fasta'))
    genomic = []
    for sequence in oriented:
        genomicSeq = WSdict[sequence.id]
        if sequence.seq in genomicSeq.seq:
            # The match lies on the forward strand of the flank sequence.
            newRecord = SeqRecord(genomicSeq.seq, id=taxon, description='')
        else:
            # Otherwise assume the reverse strand and flip the flank.
            newRecord = SeqRecord(genomicSeq.seq.reverse_complement(),
                                  id=taxon, description='')
        genomic.append(newRecord)
    write(genomic, output, 'fasta')
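# The orientation test above boils down to substring membership plus a
# reverse complement. A self-contained sketch (the sequences are
# hypothetical; assumes a modern Biopython where Seq supports `in`):
from Bio.Seq import Seq

flank = Seq('ATGGTACGCA')  # stand-in flank sequence
match = Seq('GTACG')       # stand-in exonerate match

print(match in flank)                       # True: forward-strand match
print(match in flank.reverse_complement())  # False: would need the flipped flank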
from pandas import read_csv, DataFrame
from sys import argv

from Bio.SeqIO import parse, to_dict
from utils import find_gtaa_break, pardir

HEAD_LEN = 100

if __name__ == '__main__':

    # ======================== READING ======================== #
    print('Reading filtered Perere3 alignments...')
    inpath = str(pardir / 'alinhamentos/filtered_perere3_vs_genoma.bl')
    filtered_perere3_vs_genoma = read_csv(inpath, sep=r'\s+')
    print(f"'{inpath}' read.")

    print('Reading S. mansoni genome...')
    genomedict = to_dict(parse(str(pardir / 'seqs/smgenome.fa'), 'fasta'))
    print('Dictionary created.')

    print('Opening output and annotation files...')
    heads_annotations_path = (pardir / 'genome_annotation' / 'head_annotations.gff3')
    heads_outpath = (pardir / 'seqs/heads.fa')
    heads_annotations_file = heads_annotations_path.open('w')
    heads_outfile = heads_outpath.open('w')

    # ======================== GET HEADS ======================== #
    print('Searching for the head sequences in the genome...')
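# read_csv with sep=r'\s+' parses whitespace-separated BLAST tables like
# the one read above. A tiny self-contained sketch with hypothetical
# columns and values:
from io import StringIO
from pandas import read_csv

blast_text = 'qaccver saccver pident length\nperere3 contig_1 98.5 120\n'
df = read_csv(StringIO(blast_text), sep=r'\s+')
print(df.columns.tolist())  # ['qaccver', 'saccver', 'pident', 'length']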
from io import StringIO

import pandas as pd
from Bio.SeqIO import parse, to_dict

# run_blastp is project-local.


def flank_screen(in_file, hits, flankpath, cds, threads):
    cds_dict = to_dict(parse(StringIO(in_file), 'fasta'))
    queries = ''
    for h in hits['query'].tolist():
        pos = int(h.split('_')[-1])
        acc = f'{h.rsplit("_", 1)[0]}_'
        # Collect up to `cds` CDS records on either side of the hit.
        for i in range(1, cds + 1):
            try:
                u = cds_dict[acc + str(pos - i)]
                if len(u.seq) > 2000:
                    print(f'[!] Warning: {u.id} is >2000aa and might cause blastp to hang')
                u.id = h
                u.description = f'{i} upstream'
                queries += u.format('fasta')
            except KeyError:
                continue
            try:
                d = cds_dict[acc + str(pos + i)]
                if len(d.seq) > 2000:
                    print(f'[!] Warning: {d.id} is >2000aa and might cause blastp to hang')
                d.id = h
                d.description = f'{i} downstream'
                queries += d.format('fasta')
            except KeyError:
                continue
    hit_list = []
    # For each IROMP-flanking protein that has a hit, pull a gene name and
    # description out of the hit header (parenthesised, bracketed or
    # UniProt-style).
    for q in run_blastp(queries, flankpath + '/flankdb', '1e-50', '5', threads):
        if len(q.hits) > 0:
            out = q.hsps[0].hit_description
            if out.startswith('('):
                gene_name = out.split('(', 1)[1].split(')')[0]
                description = out.split(')', 1)[1].split('[', 1)[0]
            elif '[' in out:
                gene_name = ''
                description = out.split('[', 1)[0]
            else:
                try:
                    gene_name = out.split('GN=', 1)[1].split('PE=')[0]
                except IndexError:
                    gene_name = ''
                description = out.split('OS=')[0]
            hit_list.append(f'{q.hsps[0].query_id}#'
                            f'{q.hsps[0].query_description}: '
                            f'{gene_name.strip()} '
                            f'{description.strip()}\n')
    print(f'{len(hit_list)} flanking genes identified')
    hit_df = pd.DataFrame([sub.split('#') for sub in hit_list],
                          columns=['query', 'flanking_genes'])
    hit_df = (hit_df.groupby(['query'])['flanking_genes']
              .apply(lambda x: ''.join(x.astype(str)))
              .reset_index())
    hit_df['flanking_genes'] = hit_df['flanking_genes'].str.strip()
    return hits.merge(hit_df, on='query', how='left')
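# The UniProt-style branch of the header parsing can be checked in
# isolation (the header string is hypothetical):
out = 'Ferric enterobactin receptor OS=Escherichia coli GN=fepA PE=1 SV=1'
gene_name = out.split('GN=', 1)[1].split('PE=')[0]
description = out.split('OS=')[0]
print(gene_name.strip())    # fepA
print(description.strip())  # Ferric enterobactin receptor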
# inpath, the heads_*/motherlength paths, read_table, u (the project's
# utils module) and plt are module-level names defined in the script header.
def main():
    u.log(f'{__file__}: Generating heads of {HEAD_LEN} bp.')
    truncated_count = 0

    # ======================== READING ======================== #
    heads_annotations_file = u.safe_open(heads_annotations_path, exist_ok=False)
    heads_outfile = u.safe_open(heads_outpath, exist_ok=False)
    motherlength_outfile = u.safe_open(motherlength_path, exist_ok=False)

    print('Reading filtered Perere3 alignments...', end=' ')
    filtered_perere3_vs_genoma = read_table(inpath)
    print(f"'{inpath}' read.")

    print('Reading S. mansoni genome...', end=' ')
    genomedict = to_dict(parse(str(u.genome_path), 'fasta'))
    print('Dictionary created.')

    # ======================== GET HEADS ======================== #
    print('Searching for Perere-3 copies in S. mansoni\'s genome...')
    with (u.pardir / 'seqs/perere3.fa').open() as per_file:
        perere_len = len(''.join([l.strip() for l in per_file.readlines()][1:]))

    heads = []
    for index, row in filtered_perere3_vs_genoma.iterrows():
        # Discard copies without a 3' end.
        if abs(row['qend'] - perere_len) < MAX_DISTANCE_FROM_END:
            genome_piece = genomedict[row['saccver']].seq
            plus_sense = row['sstart'] < row['send']

            if plus_sense:
                head_slice = slice(row['send'], row['send'] + GTAA_WINDOW_LEN)
                proto_head = genome_piece[head_slice]
                if u.verbose:
                    prefix = genome_piece[head_slice.start - PREFIX_LEN:head_slice.start]
            else:
                head_slice = slice(row['send'] - GTAA_WINDOW_LEN - 1, row['send'] - 1)
                proto_head = genome_piece[head_slice].reverse_complement()
                if u.verbose:
                    prefix = genome_piece[head_slice.stop:head_slice.stop + PREFIX_LEN].reverse_complement()

            # Some alignments produce invalid slices (???)
            if head_slice.start < 0 or head_slice.stop < 0:
                u.prinf('Head discarded with positions:', head_slice)
                continue

            skip_gtaa = find_gtaa_break(proto_head)
            head = proto_head[skip_gtaa:skip_gtaa + HEAD_LEN]
            # Collect the head for the optional histograms below (restores
            # the list the plotting code expects).
            heads.append(str(head))

            # =================== ANNOTATE HEAD IN THE GFF =================== #
            if plus_sense:
                start_pos = row['send'] + 1 + skip_gtaa
                end_pos = start_pos + HEAD_LEN
            else:
                end_pos = row['send'] - 1 - skip_gtaa
                start_pos = end_pos - HEAD_LEN

            heads_annotations_file.write('\t'.join([
                row['saccver'], 'WormBase_imported', 'gene',
                str(start_pos), str(end_pos), '.', ['-', '+'][plus_sense], '.',
                f'gene_id=head{index};motherlength={row["length"]};length={HEAD_LEN}'
            ]) + '\n')
            motherlength_outfile.write(f"head{index}\t{row['length']}\n")

            # =================== WRITE HEAD TO FASTA =================== #
            heads_outfile.write(f'>head{index}\n' + str(head) + '\n')
            # =========================================================== #

            if u.verbose:
                print(['-', '+'][plus_sense], prefix,
                      proto_head[:skip_gtaa] + ' | ' + head[:30 - skip_gtaa] + '...',
                      f" {len(head)}bp\t{row['pident']:.2f}%\t{row['evalue']:.2e}"
                      f"\t{row['bitscore']:5}\t{row['saccver']}"
                      f"\t{head_slice.start}-{head_slice.stop}")
        else:
            truncated_count += 1

    u.log(f'\n{filtered_perere3_vs_genoma.shape[0] - truncated_count} heads written:',
          heads_outpath, heads_annotations_path, sep='\n\t')
    u.log(f'{filtered_perere3_vs_genoma.shape[0]} alignments considered.')
    u.log(truncated_count, 'heads discarded as truncated.')

    heads_annotations_file.close()
    heads_outfile.close()
    motherlength_outfile.close()

    # ======================== PLOT HISTOGRAMS ======================== #
    if u.plot_flag:
        # Per-position base counts across all heads: one bar chart per position.
        heads_df = DataFrame.from_dict(dict(enumerate(zip(*heads))))
        for j in range(8):
            plt.figure(figsize=(16, 9))
            for i in range(12):
                plt.subplot(3, 4, i + 1)
                heads_df[i + j * 12].value_counts().plot(kind='bar')
            plt.show()
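# The strand handling above reduces to: take a fixed window just past the
# alignment end, skip any leading GTAA repeats, then keep HEAD_LEN bases.
# A toy sketch; find_gtaa_break here is a hypothetical stand-in for the
# project's utils.find_gtaa_break, not its actual implementation:
from Bio.Seq import Seq

GTAA_WINDOW_LEN, HEAD_LEN = 14, 6
genome_piece = Seq('CCCCGTAAGTAATTGACCAAGG')
send = 4  # hypothetical alignment end (plus sense)

proto_head = genome_piece[send:send + GTAA_WINDOW_LEN]

def find_gtaa_break(seq):
    """Skip leading GTAA repeats (illustrative only)."""
    offset = 0
    while str(seq[offset:offset + 4]) == 'GTAA':
        offset += 4
    return offset

skip = find_gtaa_break(proto_head)
print(proto_head[skip:skip + HEAD_LEN])  # TTGACC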