Exemplo n.º 1
0
def make_query_mapping(query_fasta):
    """Map every ID in a FASTA file to its first '|'-delimited field.

    Args:
        query_fasta: Argument handed unchanged to ``parse_fasta``.

    Returns:
        dict: Each ID yielded by iterating the ``parse_fasta`` result, mapped
        to the part of that ID preceding the first ``'|'`` (the whole ID when
        it contains no ``'|'``).

    Side effects:
        Prints every shortened ID to stdout.
    """
    mapping_dict = {}
    for full_id in parse_fasta(query_fasta):
        # Everything before the first '|' (or the full ID if there is none).
        short_id = full_id.partition('|')[0]
        print(short_id)
        mapping_dict[full_id] = short_id

    return mapping_dict
Exemplo n.º 2
0
def separate_ref_from_nonref(fasta_dir):
    """Split parsed FASTA records by whether their genome accession is REFACC.

    Args:
        fasta_dir: Argument handed unchanged to ``parse_fasta``.

    Returns:
        tuple: ``(ref_fasta_dict, nonref_fasta_dict)``. Both map the
        ``protein_id`` of the ``LongFastaID`` built from each record ID to the
        record's sequence; the first holds records whose ``genome_acc`` equals
        ``REFACC``, the second holds all others.
    """
    ref_fasta_dict = {}
    nonref_fasta_dict = {}

    for raw_id, sequence in parse_fasta(fasta_dir).items():
        parsed_id = LongFastaID(raw_id)
        # Pick the destination first, then store under the protein ID.
        if parsed_id.genome_acc == REFACC:
            destination = ref_fasta_dict
        else:
            destination = nonref_fasta_dict
        destination[parsed_id.protein_id] = sequence

    return ref_fasta_dict, nonref_fasta_dict
Exemplo n.º 3
0
            query_res += 1

        if ref_aa != '-':
            ref_res += 1

        if aa != '-' and ref_aa != '-':
            mapping[query_res] = ref_res

    return mapping


if __name__ == "__main__":
    # Load the BLAST hits, then the query and reference sequence collections.
    blast_results = parse_blast_output(BLAST_OUTPUT)
    id_to_sequence = parse_fasta(PDB_SEQS_STRUCTURE)
    ref_to_sequence = parse_fasta(REFERENCE_PROTEOME)

    # Order the hits, combine them, and keep the best hit per query.
    ordered_blast_results = order_hits(blast_results)
    combined_blast_results = combine_all_hits(ordered_blast_results)
    best_hits = get_best_hits(combined_blast_results)

    for query, best_hit in best_hits.items():
        # The second element of a best hit is used as the reference ID.
        ref_id = best_hit[1]
        # Two-record FASTA: the query sequence and its best reference hit.
        fasta_dict = {
            query: id_to_sequence[query],
            ref_id: ref_to_sequence[ref_id],
        }

        temp_fasta = f'{TEMP}{query}.fasta'
        write_fasta(fasta_dict, temp_fasta)
        # Path for the aligned counterpart of the file written above.
        temp_aligned = f'{TEMP}{query}_aligned.fasta'
Exemplo n.º 4
0
"""


if sys.argv[1] in ('-p', '--preprocess'):
    # Preprocess only.
    state = 'preprocess'
    genome_file = sys.argv[2]
    print('will preprocess', genome_file)
    #out_file = '.'.join(genome_file.split('/')[-1].split('.')[0:-1]) + '.pickle' # isolate file name from path and extension.
    out_file = 'preprocessed_sequences_bw.pickle'  # Use the same file name.

    # Collects one preprocessed search object per genome, keyed by its index.
    dictionary = {}

    for _i, genome in enumerate(parse_fasta(genome_file)):
        print(f"\t{_i}: preprocessing {genome['title']}")
        # One object for each genome.
        o = bwt.search_bwt(genome['title'], genome['sequence'])
        o.main_preprocess()
        dictionary[_i] = o

    # Save dictionary with objects of all sequences to disk with pickle.
    with open(out_file, 'wb') as file:
        pickle.dump(dictionary, file)
        # Blank line, message, blank line, then the tab-indented file name.
        print('', 'Successfully saved to:', '', '\t' + out_file, sep='\n')

    print()
Exemplo n.º 5
0
        

        if ref_aa != '-':
            ref_res += 1

        if aa != '-' and ref_aa != '-':
            mapping[query_res] = ref_res

    return mapping
            

if __name__ == "__main__":
    # Load the BLAST hits, then the query and reference sequence collections.
    blast_results = parse_blast_output(BLAST_OUTPUT)
    id_to_sequence = parse_fasta(UNIQUE_SEQS)
    ref_to_sequence = parse_fasta(REFERENCE_PROTEOME)

    # Order the hits, combine them, and keep the best hit per query.
    ordered_blast_results = order_hits(blast_results)
    combined_blast_results = combine_all_hits(ordered_blast_results)
    best_hits = get_best_hits(combined_blast_results)

    for query, best_hit in best_hits.items():
        # The second element of a best hit is used as the reference ID.
        ref_id = best_hit[1]
        # Two-record FASTA: the query sequence and its best reference hit.
        fasta_dict = {
            query: id_to_sequence[query],
            ref_id: ref_to_sequence[ref_id],
        }

        temp_fasta = f'{TEMP}{query}.fasta'
        write_fasta(fasta_dict, temp_fasta)
        # Path for the aligned counterpart of the file written above.
        temp_aligned = f'{TEMP}{query}_aligned.fasta'
Exemplo n.º 6
0
from st import suffixtree
from parsers import parse_fasta, parse_fastq
import sys

# Command-line arguments: first the genome FASTA path, then the reads FASTQ path.
genome_file = sys.argv[1]
reads_file = sys.argv[2]

for genome in parse_fasta(genome_file):
    # The suffix tree depends only on the genome sequence, so it is built
    # once per genome here rather than once per read.
    st = suffixtree(genome['sequence'])

    for read in parse_fastq(reads_file):
        read_seq = read['sequence']

        for match in st.find_positions(read_seq):
            # One tab-separated, 11-column record per match position.
            # NOTE(review): the column layout looks like SAM's mandatory
            # fields; if so, the first column would normally be the read
            # name, whereas the read sequence is printed here -- confirm
            # against what parse_fastq provides.
            # NOTE(review): `match + 1` presumably converts a 0-based offset
            # from find_positions into a 1-based position -- confirm.
            print(
                read_seq,
                0,
                genome['title'],
                match + 1,
                0,
                f"{len(read_seq)}M",
                '*',
                0,
                0,
                read_seq,
                len(read_seq) * '~',  # one '~' per base of the read
                sep='\t',
            )

Exemplo n.º 7
0
    prot_ids = set([])

    for id in fasta_dict:
        prot_ids.add(parse_id_from_prot_file(id))

    return prot_ids


def extract_protids(fasta_dict):
    """Collect the protein IDs of all records, cut at the first '.'.

    Args:
        fasta_dict: Mapping whose values expose a ``protein_id`` string
            attribute. The keys are not used.

    Returns:
        set: For every value, the part of ``protein_id`` before its first
        ``'.'`` (the whole ``protein_id`` when it contains no ``'.'``).
    """
    # Only the values matter, so iterate them directly instead of binding an
    # unused key (the original key name also shadowed the builtin ``id``).
    return {
        protdata.protein_id.split('.')[0]
        for protdata in fasta_dict.values()
    }


if __name__ == "__main__":
    # Two FASTA paths from the command line; each is read with its own parser.
    used_fasta, other_fasta = argv[1], argv[2]

    fasta_dict_1 = parse_fasta(used_fasta)
    fasta_dict_2 = parse_fasta_simple(other_fasta)

    prot_ids_1 = extract_protids(fasta_dict_1)
    prot_ids_2 = extract_protids_simple(fasta_dict_2)

    # IDs present only in the first ("used") file, reported on a single line.
    only_in_used = prot_ids_1 - prot_ids_2
    print("Extra in used", only_in_used)

    # IDs present only in the second file, reported one per line.
    only_in_other = prot_ids_2 - prot_ids_1
    for ID in only_in_other:
        print(ID)
from writers import write_fasta
from sys import argv


def get_refseqs(refseq_to_uniprot):
    """Return the set of entries of *refseq_to_uniprot*, cut at the first '.'.

    Args:
        refseq_to_uniprot: Iterable of strings (for a dict, its keys).

    Returns:
        set: The part of each string before its first ``'.'``.
    """
    return {refseq.split('.')[0] for refseq in refseq_to_uniprot}


if __name__ == "__main__":
    # argv[1]: FASTA to filter; argv[2]: mapping file passed to parse_mapping.
    fasta, refseqs = argv[1], argv[2]

    refseq_to_uniprot = parse_mapping(refseqs)
    # From here on `refseqs` is the set of accessions, no longer the path.
    refseqs = get_refseqs(refseq_to_uniprot)

    fasta_dict = parse_fasta(fasta)
    refseq_to_seq = {}

    for header, sequence in fasta_dict.items():
        # Text before the first '|'; it is printed before being stripped.
        leading_field = header.partition('|')[0]
        print(leading_field)
        accession = leading_field.strip()
        if accession not in refseqs:
            continue
        refseq_to_seq[accession] = sequence

    write_fasta(refseq_to_seq, 'reference_proteome_complete.fasta')
Exemplo n.º 9
0
    sequence_to_id = {}
    for fasta_id, sequence in fasta_dict.items():
        fasta_id = parse_fasta_id(fasta_id)
        if not sequence in sequence_to_id:
            sequence_to_id[sequence] = []

        sequence_to_id[sequence].append(fasta_id)

    return sequence_to_id


def assign_code(sequence_to_id):
    """Give every sequence a sequential code of the form ``seq_NNNN``.

    Codes are numbered from 0 in the iteration order of *sequence_to_id* and
    zero-padded to at least four digits.

    Args:
        sequence_to_id: Dict mapping a sequence to its accessions.

    Returns:
        tuple: ``(code_to_sequence, code_to_accession)``, mapping each code to
        its sequence and to that sequence's accessions respectively.
    """
    code_to_sequence, code_to_accession = {}, {}

    for index, sequence in enumerate(sequence_to_id):
        code = 'seq_%.4d' % index
        code_to_sequence[code] = sequence
        code_to_accession[code] = sequence_to_id[sequence]

    return code_to_sequence, code_to_accession


if __name__ == "__main__":
    fasta = argv[1]

    # Parse the input and group its IDs by sequence.
    id_to_sequence = parse_fasta(fasta)
    sequence_to_id = reverse_fasta_dict(id_to_sequence)

    # Assign one code per sequence, then write out both lookup tables.
    coded = assign_code(sequence_to_id)
    code_to_sequence, code_to_accession = coded
    write_fasta(code_to_sequence, UNIQUE_SEQ_DIR)
    write_code_to_accession(code_to_accession, CODE_DIR)
Exemplo n.º 10
0
def make_new_fasta_dict(seq_to_id):
    """Invert a sequence -> ID mapping into an ID -> sequence mapping.

    Args:
        seq_to_id: Dict mapping each sequence to a (hashable) ID.

    Returns:
        dict: Each ID mapped to its sequence. When several sequences share an
        ID, the one that comes last in iteration order wins.
    """
    return {seq_id: seq for seq, seq_id in seq_to_id.items()}
    
            
            
if __name__ == "__main__":
    # Positional arguments: BLAST output, FASTA file, ORF mapping file.
    blast_output, covid19_fasta, orf_mapping = argv[1], argv[2], argv[3]

    covid19_fasta_dict = parse_fasta(covid19_fasta)
    # `orf_mapping` is rebound from the file path to the parsed mapping.
    orf_mapping = parse_mapping(orf_mapping)

    queryid_to_hits = parse_blast_output(blast_output)
    sorted_hit_dict = order_hits(queryid_to_hits)

    # Combine the hits of every query separately.
    combined_hit_dicts = {
        query: combine_hits(subject_to_hits)
        for query, subject_to_hits in sorted_hit_dict.items()
    }

    filtered_hits, identical_hits, rejected_hits = filter_hits(
        combined_hit_dicts, covid19_fasta_dict)
    # Debug aid (disabled): print(identical_hits.items())

    subject_to_queries = map_subject_to_queries(filtered_hits)

    fasta_dicts = make_fasta_dicts(
        subject_to_queries, covid19_fasta_dict, orf_mapping)
def run_blastp(in_file):
    """Run ``blastp`` with *in_file* as query against REFERENCE_PROTEOME.

    The results are written to ``TEMP_BLAST`` in output format 6 with the
    columns listed in ``outfmt`` below.

    Args:
        in_file: Value passed to blastp's ``-query`` option.

    Raises:
        subprocess.CalledProcessError: If blastp exits with a non-zero status.
    """
    # Single command-line argument: format number followed by column names.
    outfmt = ("6 qseqid sseqid pident length mismatch qstart qend qlen sstart "
              "send slen evalue bitscore qcovs")

    subprocess.check_call([
        'blastp',
        '-query', in_file,
        '-subject', REFERENCE_PROTEOME,
        '-out', TEMP_BLAST,
        '-outfmt', outfmt,
    ])

    

if __name__ == "__main__":
    fasta = argv[1]

    # Sequences already on record, indexed by sequence.
    unique_id_to_seq = parse_fasta(UNIQUE_SEQ_DIR)
    unique_seq_to_id = reverse_fasta_dict(unique_id_to_seq)

    # Sequences from the FASTA given on the command line, indexed the same way.
    new_id_to_seq = parse_fasta(fasta)
    new_seq_to_id = reverse_fasta_dict(new_id_to_seq)

    for sequence in new_seq_to_id:
        if sequence in unique_seq_to_id:
            # TODO: the handling of sequences that are already on record was
            # never written; this placeholder keeps the script syntactically
            # valid without inventing behaviour.
            pass