def main():
    start_time = time()

    parser = cmd_parse()  # Parse command-line arguments

    with open(parser['file']) as genome:
        fasta_genome = to_dict(parse(genome, 'fasta'))  # Reading genome file

    jobs = parser['jobs']  # Number of processes to parallelize
    fragments = parser['fragments_num']  # Number of fragments to get
    frags_per_core = [fragments // jobs] * (jobs - 1)
    frags_per_core.append(fragments - sum(frags_per_core))  # Number of fragments each process generates; the last takes the remainder

    dis_file = parser['dis_file']  # Path to the empirical distribution file
    emp_dis = rfd(dis_file) if dis_file is not None else None  # Read the empirical distribution
    my_seed = parser['seed']  # NumPy seeding argument

    processes = []  # List of processes to parallelize
    for job, fragments_num in enumerate(frags_per_core):
        # Derive a distinct, reproducible per-process seed (None disables seeding; -1 passes through)
        seeding = ((my_seed + job) % MAX_SEED if my_seed != -1 else my_seed) if my_seed is not None else None
        processes.append(Process(target=disassembler,
                                 args=(fasta_genome, parser['seq_type'], fragments_num, parser['out_file'],
                                       parser['depth'], parser['read_len'], job, seeding, emp_dis, parser['mean_len'])))
        processes[-1].start()
    for process in processes:
        process.join()
    print_verbose(
        'The program completed disassembling without any errors. Elapsed time={:f}'.format(time() - start_time),
        parser['session_id'], parser['logfile'], parser['verbose'], parser['params'])  # Parameters logging
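
The frags_per_core arithmetic above gives every worker floor(fragments / jobs) fragments and hands the remainder to the last one, so the per-process counts always sum to the requested total. A standalone sketch of the same split:

def split_fragments(fragments, jobs):
    """Even split across `jobs` workers; the last worker absorbs the remainder."""
    per_core = [fragments // jobs] * (jobs - 1)
    per_core.append(fragments - sum(per_core))
    return per_core

assert split_fragments(10, 3) == [3, 3, 4]
assert split_fragments(7, 1) == [7]
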
Example #2
def domain_filter(in_file, iromppath, lowqual, threads):
    # First HMM pass (PF07715); bail out early if nothing hits
    q = run_hmmsearch(in_file, iromppath + '/PF07715.hmm', threads)
    if not q:
        return ''
    in_file_dict = to_dict(parse(StringIO(in_file), 'fasta'))
    filter1 = ''
    for h in q.hit_keys:
        filter1 += in_file_dict[h].format("fasta")
    print(f'Filtered {filter1.count(">")} proteins with {q.description}')
    # Low-quality mode (or an empty first pass) skips the stricter second filter
    if lowqual or not filter1:
        return filter1
    else:
        q = run_hmmsearch(filter1, iromppath + '/PF00593.hmm', threads)
        filter2 = ''
        filter1_dict = to_dict(parse(StringIO(filter1), 'fasta'))
        for h in q.hit_keys:
            filter2 += filter1_dict[h].format("fasta")
        print(f'Filtered {filter2.count(">")} proteins with {q.description}')
        return filter2
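
A minimal usage sketch for domain_filter; 'proteins.faa' and './hmms' are placeholder paths, and run_hmmsearch is assumed to be importable from the surrounding project:

# Hypothetical invocation with placeholder paths.
with open('proteins.faa') as f:
    proteome = f.read()

filtered = domain_filter(proteome, './hmms', lowqual=False, threads=4)
print(f'{filtered.count(">")} candidate proteins kept')
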
Example #3
def export_proteins(in_file, out_file, name, hits):
    to_write = []
    in_file_dict = to_dict(parse(StringIO(in_file), 'fasta'))
    for row in hits.itertuples():
        r = in_file_dict[row.query]
        r.id = row.query
        r.description = f'{row.description} {row.hit}'
        to_write.append(r)
    if out_file == '':
        if name == '-':
            out_file = 'stdin_sideroscanner.faa'
        else:
            out_file = f'{name}_sideroscanner.faa'
    write(to_write, out_file, "fasta")
    print(f'[-] Proteins written to: {out_file}')
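
A usage sketch for export_proteins; the column names ('query', 'description', 'hit') are inferred from the itertuples() attribute access above, and the row contents are placeholders:

import pandas as pd

# Placeholder hit table; one row per protein to export.
hits = pd.DataFrame([{'query': 'contig1_42',
                      'description': 'TonB-dependent receptor',
                      'hit': 'FhuA'}])
with open('proteins.faa') as f:
    export_proteins(f.read(), '', 'sample', hits)
# -> writes sample_sideroscanner.faa
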
Example #4
def tfbs_screen(in_file, furpath, hits, length):
    prom_dict = to_dict(parse(StringIO(in_file), 'fasta'))
    queries = ''
    for row in hits.itertuples():
        if row.str == '1':
            f_start = int(row.start) - length
            f_end = int(row.start)
        elif row.str == '-1':
            f_start = int(row.end)
            f_end = int(row.end) + length
        else:
            continue  # unknown strand: f_start/f_end would be undefined
        try:
            x = prom_dict[row.contig][f_start:f_end]
            x.id = f'{row.query}:{f_start}'
            queries = queries + x.format("fasta")
        except KeyError:
            continue

    mast_out = run_mast(queries, furpath + '/fur.meme').rstrip()
    results = [line for line in mast_out.split('\n') if not line.startswith('#')]
    if not results:
        print("No binding sites found")
        return hits
    else:
        print(f'Putative TFBS for {len(results)} hit(s)')
        mast = []
        for q in results:
            fields = q.split(' ')
            query = fields[0].split(':')[0]
            offset = int(fields[0].split(':')[1])
            fur_start = offset + int(fields[4])
            fur_end = offset + int(fields[5])
            pval = fields[8]
            # str(seq) instead of the private Seq._data attribute
            bs = str(prom_dict[query.rsplit('_', 1)[0]][fur_start:fur_end].seq)
            if fields[1] == '-1':
                bs = str(Seq(bs).reverse_complement())
            mast.append(f'{query}#{fur_start}#{fur_end}#{pval}#{bs}')
        df = pd.DataFrame(
            [sub.split("#") for sub in mast],
            columns=['query', 'fur_start', 'fur_end', 'p_value', 'fur_box'])
        return hits.merge(df, on='query', how='left')
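
A call sketch for tfbs_screen: hits must carry 'query', 'contig', 'str', 'start' and 'end' columns (again inferred from the itertuples() access), in_file is the contig FASTA as a string, and the MEME motif directory is a placeholder:

with open('contigs.fna') as f:
    annotated = tfbs_screen(f.read(), './fur_motifs', hits, length=300)
print(annotated[['query', 'p_value', 'fur_box']].dropna())
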
Example #5
def matchExonerateCdhit(fasta, flanks, taxon, output):
    """ 
	my %opts = { fasta  => "*cdhit.exonerate",
	    		 flanks => "*.cdhit",
				 taxon  => $core }
	"""

    # read in sequences, dropping exonerate's log lines
    with open(fasta) as handle:
        lines = [l for l in handle if not re.search('(Command|Hostname|exonerate)', l)]
    records = list(parse(io.StringIO(''.join(lines)), "fasta"))

    # filter just the sequences in the correct orientation
    # (known from the reference sequence used with exonerate)
    oriented = []

    for record in records:
        coordinates = record.description.split("\t")[1]
        start = int(coordinates.split("-")[0])
        end = int(coordinates.split("-")[1])
        if end > start:
            oriented.append(record)

    WSdict = to_dict(parse(flanks, "fasta"))
    genomic = []

    for sequence in oriented:
        genomicSeq = WSdict[sequence.id]
        if sequence.seq in genomicSeq.seq:
            newRecord = SeqRecord(genomicSeq.seq, id=taxon, description='')
            genomic.append(newRecord)
        else:
            # not on the forward strand, so take the reverse complement
            genomicSeqRev = genomicSeq.seq.reverse_complement()
            newRecord = SeqRecord(genomicSeqRev, id=taxon, description='')
            genomic.append(newRecord)

    write(genomic, output, "fasta")
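
An invocation sketch for matchExonerateCdhit, with file names matching the globs quoted in the docstring (all placeholders):

matchExonerateCdhit(fasta='locus1.cdhit.exonerate',
                    flanks='locus1.cdhit',
                    taxon='schMan',
                    output='locus1.oriented.fa')
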
Example #6
from pandas import read_csv, DataFrame
from sys import argv
from Bio.SeqIO import parse, to_dict  # used below to index the genome FASTA
from utils import find_gtaa_break, pardir

HEAD_LEN = 100

if __name__ == '__main__':
    #======================== READING ========================#

    print('Reading filtered Perere-3 alignments...')
    inpath = str(pardir / 'alinhamentos/filtered_perere3_vs_genoma.bl')
    filtered_perere3_vs_genoma = read_csv(inpath, sep='\\s+')
    print(f"'{inpath}' read.")

    print('Reading S. mansoni genome...')
    genomedict = to_dict(parse(str(pardir / 'seqs/smgenome.fa'), 'fasta'))
    print('Dictionary built.')

    print('Opening output and annotation files...')

    heads_annotations_path = (pardir / 'genome_annotation' /
                              'head_annotations.gff3')
    heads_outpath = (pardir / 'seqs/heads.fa')

    heads_annotations_file = heads_annotations_path.open('w')
    heads_outfile = heads_outpath.open('w')

    #======================== GET HEADS ========================#

    print('Searching for head sequences in the genome...')
Example #7
def flank_screen(in_file, hits, flankpath, cds, threads):
    cds_dict = to_dict(parse(StringIO(in_file), 'fasta'))
    queries = ''
    for h in hits['query'].tolist():
        pos = int(h.split('_')[-1])
        acc = f'{h.rsplit("_", 1)[0]}_'
        for i in range(1, cds + 1):
            # i-th gene upstream of the hit
            try:
                u = cds_dict[acc + str(pos - i)]
                if len(u.seq) > 2000:
                    print(f'[!] Warning: {u.id} is >2000aa and might cause blastp to hang')
                u.id = h
                u.description = f'{i} upstream'
                queries = queries + u.format("fasta")
            except KeyError:
                continue
            # i-th gene downstream of the hit
            try:
                d = cds_dict[acc + str(pos + i)]
                if len(d.seq) > 2000:
                    print(f'[!] Warning: {d.id} is >2000aa and might cause blastp to hang')
                d.id = h
                d.description = f'{i} downstream'
                queries = queries + d.format("fasta")
            except KeyError:
                continue

    hit_list = []
    # For each IROMP flanking protein
    for q in run_blastp(queries, flankpath + '/flankdb', '1e-50', '5',
                        threads):
        # If the protein has a hit
        if len(q.hits) > 0:
            out = q.hsps[0].hit_description
            if out.startswith('('):
                gene_name = out.split('(', 1)[1].split(')')[0]
                description = out.split(')', 1)[1].split('[', 1)[0]
            elif '[' in out:
                gene_name = ''
                description = out.split('[', 1)[0]
            else:
                try:
                    gene_name = out.split('GN=', 1)[1].split('PE=')[0]
                except IndexError:
                    gene_name = ''
                # split()[0] always exists, so this cannot raise
                description = out.split('OS=')[0]
            hit_list.append(f'{q.hsps[0].query_id}#'
                            f'{q.hsps[0].query_description}: '
                            f'{gene_name.strip()} '
                            f'{description.strip()}\n')

    print(f'{len(hit_list)} flanking genes identified')

    hit_df = pd.DataFrame([sub.split("#") for sub in hit_list],
                          columns=['query', 'flanking_genes'])

    hit_df = (hit_df.groupby('query')['flanking_genes']
              .apply(lambda x: ''.join(x.astype(str)))
              .reset_index())
    hit_df['flanking_genes'] = hit_df['flanking_genes'].str.strip()
    return hits.merge(hit_df, on='query', how='left')
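
A call sketch for flank_screen; it assumes Prodigal-style '<contig>_<index>' protein ids (which the pos = int(h.split('_')[-1]) parsing relies on) and a flanking-gene BLAST database directory, both placeholders here:

with open('all_cds.faa') as f:
    annotated = flank_screen(f.read(), hits, './flank_db_dir', cds=3, threads=4)
print(annotated[['query', 'flanking_genes']].head())
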
Example #8
def main():
    u.log(f'{__file__}: Generating heads of {HEAD_LEN} bp.')
    truncated_count = 0
    #======================== READING ========================#
    heads_annotations_file = u.safe_open(heads_annotations_path,
                                         exist_ok=False)
    heads_outfile = u.safe_open(heads_outpath, exist_ok=False)
    motherlength_outfile = u.safe_open(motherlength_path, exist_ok=False)

    print('Reading filtered Perere-3 alignments...', end=' ')
    filtered_perere3_vs_genoma = read_table(inpath)
    print(f"'{inpath}' read.")

    print('Reading S. mansoni genome...', end=' ')
    genomedict = to_dict(parse(str(u.genome_path), 'fasta'))
    print('Dictionary built.')

    #======================== GET HEADS ========================#
    print('Searching for Perere-3 copies in S. mansoni\'s genome...')

    with (u.pardir / 'seqs/perere3.fa').open() as per_file:
        # element length = every sequence line concatenated, skipping the FASTA header
        perere_len = len(''.join([l.strip()
                                  for l in per_file.readlines()][1:]))

    heads = []

    for index, row in filtered_perere3_vs_genoma.iterrows():

        # Discard copies without 3' end.
        if abs(row['qend'] - perere_len) < MAX_DISTANCE_FROM_END:
            genome_piece = genomedict[row['saccver']].seq
            plus_sense = row['sstart'] < row['send']

            if plus_sense:
                head_slice = slice(row['send'], row['send'] + GTAA_WINDOW_LEN)
                proto_head = genome_piece[head_slice]

                if u.verbose:
                    prefix = genome_piece[head_slice.start -
                                          PREFIX_LEN:head_slice.start]

            else:
                head_slice = slice(row['send'] - GTAA_WINDOW_LEN - 1,
                                   row['send'] - 1)
                proto_head = genome_piece[head_slice].reverse_complement()

                if u.verbose:
                    prefix = genome_piece[head_slice.stop:head_slice.stop +
                                          PREFIX_LEN].reverse_complement()

            ###### Some slices go wrong (???)
            if head_slice.start < 0 or head_slice.stop < 0:
                u.prinf('Head discarded with positions:', head_slice)
                continue

            skip_gtaa = find_gtaa_break(proto_head)
            head = proto_head[skip_gtaa:skip_gtaa + HEAD_LEN]

            #======================== ANNOTATE HEAD IN GFF ========================#

            if plus_sense:
                start_pos = row['send'] + 1 + skip_gtaa
                end_pos = start_pos + HEAD_LEN

            else:
                end_pos = row['send'] - 1 - skip_gtaa
                start_pos = end_pos - HEAD_LEN

            heads_annotations_file.write('\t'.join([
                row['saccver'], 'WormBase_imported', 'gene',
                str(start_pos),
                str(end_pos), '.', ['-', '+'][plus_sense], '.',
                f'gene_id=head{index};motherlength={row["length"]};length={HEAD_LEN}'
            ]) + '\n')

            motherlength_outfile.write(f"head{index}\t{row['length']}\n")

            #======================== WRITE HEAD TO FASTA ========================#
            heads_outfile.write(f'>head{index}\n' + str(head) + '\n')
            heads.append(str(head))  # collected for the per-position histograms below
            #========================================================================#

            if u.verbose:
                print(
                    ['-', '+'][plus_sense], prefix, proto_head[:skip_gtaa] +
                    ' | ' + head[:30 - skip_gtaa] + '...',
                    f" {len(head)}bp\t{row['pident']:.2f}%\t{row['evalue']:.2e}\t{row['bitscore']:5}\t{row['saccver']}\t{head_slice.start}-{head_slice.stop}"
                )

        else:
            truncated_count += 1

    u.log(
        f'\n{filtered_perere3_vs_genoma.shape[0] - truncated_count} heads written:',
        heads_outpath,
        heads_annotations_path,
        sep='\n\t')
    u.log(f'{filtered_perere3_vs_genoma.shape[0]} alignments considered.')
    u.log(truncated_count, 'heads discarded as truncated.')

    heads_annotations_file.close()
    heads_outfile.close()
    motherlength_outfile.close()

    #======================== PLOT HISTOGRAMS ========================#

    if u.plot_flag:
        heads_df = DataFrame.from_dict(dict(enumerate(zip(*heads))))

        # 8 figures × 12 subplots: per-position base counts for positions 0-95
        for j in range(8):
            plt.figure(figsize=(16, 9))
            for i in range(12):
                plt.subplot(3, 4, i + 1)
                heads_df[i + j * 12].value_counts().plot(kind='bar')

            plt.show()