Пример #1
0
def get_cm_model_table(query_file,
                       params=None,
                       threads=None,
                       rfam=None,
                       timeout=None):
    """
    Run cmscan on the query file and return its parsed tabular output.

    :param query_file: query passed to ``run_cmscan``
    :param params: optional dict; a truthy value stored under the ``'cmscan'``
        key is appended to the default ``'-g '`` cmscan parameters
    :param threads: forwarded to ``run_cmscan``
    :param rfam: forwarded to ``run_cmscan``
    :param timeout: forwarded to ``run_cmscan``
    :return: whatever ``parse_cmalign_infernal_table`` returns for the cmscan
        output table, or ``None`` when a ``CmscanException`` is raised
    """
    ml.debug(fname())
    if params is None:
        params = dict()

    cmscan_params = '-g '
    if 'cmscan' in params and params['cmscan']:
        cmscan_params += params['cmscan']

    try:
        out_table = run_cmscan(query_file,
                               params=cmscan_params,
                               threads=threads,
                               rfam=rfam,
                               timeout=timeout)
        try:
            # the handle is closed by the ``with`` before the file is removed
            with open(out_table, 'r') as f:
                return parse_cmalign_infernal_table(f)
        finally:
            # remove the cmscan output even if parsing fails
            remove_one_file_with_try(out_table)
    except exceptions.CmscanException:
        return None
Пример #2
0
def extract_ref_from_cm(cm_file):
    """
    Extract the reference structure of a covariance model.

    One sequence is emitted from the model with ``run_cmemit``
    (``'-a -N 1'``), the resulting alignment is read with ``stockholm_read``
    and its ``SS_cons`` column annotation is recoded with ``cm_strucutre2br``.

    :param cm_file: covariance model file passed to ``run_cmemit``
    :return: result of ``cm_strucutre2br`` for the gap-free ``SS_cons`` string
    :raises AssertionError: when the emitted alignment does not contain
        exactly one record
    """
    ml.debug(fname())
    single_alig_file = run_cmemit(cm_file, params='-a -N 1')
    try:
        with open(single_alig_file, 'r') as o:
            salig = stockholm_read(o)
    finally:
        # the emitted alignment is only needed for parsing; remove it even
        # when reading fails
        remove_one_file_with_try(single_alig_file)

    if len(salig) != 1:
        raise AssertionError('File from cmemit does not have only one record in (not including reference).')

    # recode structure, return it
    ss = salig.column_annotations['SS_cons']
    # inserts = str(salig[0].seq)
    inserts = str(salig.column_annotations['RF'])

    gapchars = '.~'

    # NOTE(review): the RF line is only zipped along (it bounds the iteration
    # length); the gap test is made on the SS_cons character itself - confirm
    # this is the intended column filter.
    structure_list = [i for i, _ in zip(ss, inserts) if i not in gapchars]

    # recode structure list
    structure = cm_strucutre2br(''.join(structure_list))
    return structure
Пример #3
0
def _prepare_pictures(sub):
    pictureslist = []
    picfile = None
    for key in sub.letter_annotations.keys():
        np = dict()
        np['picname'] = key
        np['secondary_structure'] = sub.letter_annotations[key]

        try:
            picfile = run_rnaplot(seq=str(sub.seq),
                                  structure=sub.letter_annotations[key],
                                  format='svg')
            with open(picfile) as f:
                np['pic'] = "data:image/svg+xml;utf8," + f.read()

            pictureslist.append(np)

            remove_one_file_with_try(picfile)
        except RNAplotException:
            print("can't draw structure with RNAfold for {}.".format(sub.id))
        except FileNotFoundError:
            if picfile is not None:
                print('cannot remove file: {}, file not found'.format(picfile))
        except OSError:
            if picfile is not None:
                print('cannot remove file: {}, file is directory'.format(
                    picfile))

    return pictureslist
Пример #4
0
def run_rnaplot(seq, structure=None, format='svg', outfile=None):
    """
    Draw a secondary structure with RNAplot in the desired format.

    The sequence and structure are written to a temporary fasta-like file
    that is fed to RNAplot on stdin. RNAplot is run with the system
    temporary directory as working directory and names its output after the
    sequence name (here the temporary file name) + ``'_ss.'`` + format.

    :param seq: sequence string when ``structure`` is given; otherwise an
        object with ``.seq`` and ``.letter_annotations['ss0']``
    :param structure: structure string, or ``None`` to take it from ``seq``
    :param format: one of ``'ps'``, ``'svg'``, ``'gml'``, ``'xrna'``
    :param outfile: optional path the plot is moved to
    :return: path to the plot (``outfile`` if given, otherwise the file
        created in the system temporary directory)
    :raises AssertionError: when sequence and structure differ in length
    :raises TypeError: when ``format`` is not an allowed format
    :raises exceptions.RNAplotException: when RNAplot exits with non-zero code
    """
    ml.debug(fname())
    if structure is None:
        sequence = str(seq.seq)
        structure = seq.letter_annotations['ss0']
    else:
        sequence = seq

    # explicit raise so the check also holds when python runs with -O
    if len(sequence) != len(structure):
        raise AssertionError('Sequence and structure must have the same length.')

    allowed_formats = {'ps', 'svg', 'gml', 'xrna'}
    if format not in allowed_formats:
        raise TypeError('Format can be only from {}.'.format(allowed_formats))

    fd, tmpfile = mkstemp(prefix='rba_', suffix='_08', dir=CONFIG.tmpdir)
    try:
        rnaname = tmpfile.split('/')[-1].split('\\')[-1]

        with os.fdopen(fd, 'w') as fh:
            fh.write('>{}\n{}\n{}\n'.format(rnaname, sequence, structure))

        cmd = '{} --output-format={} < {}'.format(
            shlex.quote('{}RNAplot'.format(CONFIG.viennarna_path)),
            shlex.quote(format), shlex.quote(tmpfile))
        ml.debug(cmd)

        currdirr = os.getcwd()
        tmpdir = gettempdir()
        os.chdir(tmpdir)
        try:
            with TemporaryFile(mode='w+', encoding='utf-8') as tmp:
                r = call(cmd, shell=True, stdout=tmp, stderr=tmp)
                if r:
                    msgfail = 'Call to RNAplot failed.'
                    ml.error(msgfail)
                    tmp.seek(0)
                    details = tmp.read()
                    ml.debug(details)
                    raise exceptions.RNAplotException(msgfail, details)
        finally:
            # always restore the caller's working directory
            os.chdir(currdirr)
    finally:
        # the input file is not needed anymore, whether RNAplot succeeded or not
        remove_one_file_with_try(tmpfile)

    # output file is name of the sequence (in this case name of the file) + "_ss." + chosen format
    plot_output_file = os.path.join(tmpdir, rnaname + '_ss.' + format)
    if outfile is None:
        return plot_output_file

    shutil.move(plot_output_file, outfile)
    return outfile
Пример #5
0
def rnafold_prediction(fasta2predict, params=''):
    """
    Predict structures for sequences in a fasta file and return them parsed.

    A temporary output file is created, filled by ``rnafold_fasta``, read with
    ``read_seq_str`` and removed again.

    :param fasta2predict: input file passed to ``rnafold_fasta``
    :param params: parameters passed to ``rnafold_fasta``
    :return: result of ``read_seq_str`` on the prediction output
    """
    ml.debug(fname())
    fd, structure_output_file = mkstemp(prefix='rba_',
                                        suffix='_54',
                                        dir=CONFIG.tmpdir)
    os.close(fd)

    try:
        structure_output_file = rnafold_fasta(fasta2predict,
                                              structure_output_file, params)
        return read_seq_str(structure_output_file)
    finally:
        # remove the output file even when prediction or parsing fails
        remove_one_file_with_try(structure_output_file)
Пример #6
0
def extract_ref_structure_fromRFAM_CM(model_name):
    """
    Extract reference structure encoded in covariance model to dot bracket notation.

    The model is fetched from the Rfam file (``RfamInfo().file_path``) with
    ``run_cmfetch`` into a separate file, which is removed once the structure
    has been extracted.

    :param model_name: model name in cm file
    :return: string
    """
    ml.debug(fname())
    rfam = RfamInfo()

    single_cm_file = run_cmfetch(rfam.file_path, model_name)
    try:
        return extract_ref_from_cm(single_cm_file)
    finally:
        # remove the fetched model even when the extraction raises
        remove_one_file_with_try(single_cm_file)
Пример #7
0
def _run_hybrid_ss_min_wrapper(seq, P, W, M):
    """
    Run hybrid-ss-min for a single record through a temporary fasta file.

    :param seq: record exposing ``id`` and ``seq``
    :param P: forwarded to ``_run_hybrid_ss_min_single``
    :param W: forwarded to ``_run_hybrid_ss_min_single``
    :param M: forwarded to ``_run_hybrid_ss_min_single``
    :return: first item of the prediction result, or ``None`` when a
        ``HybridssminException`` is raised
    """
    handle, fasta_path = mkstemp(prefix='rba_',
                                 suffix='_06',
                                 dir=CONFIG.tmpdir)
    try:
        with os.fdopen(handle, 'w') as fasta_out:
            fasta_out.write('>{}\n{}\n'.format(seq.id, str(seq.seq)))

        predictions = _run_hybrid_ss_min_single(fasta_path, P, W, M)
        result = predictions[0]
    except exceptions.HybridssminException:
        result = None
    finally:
        # the temporary fasta is removed on every exit path
        remove_one_file_with_try(fasta_path)

    return result
Пример #8
0
def rnafoldc(seqr, constraints_id='cons'):
    """
    Predict structure for one record with ``rnafold_prediction`` run with ``-C``.

    The sequence and its constraint line are written to a temporary file,
    which is removed after the prediction.

    :param seqr: record exposing ``seq`` and ``letter_annotations``
    :param constraints_id: key in ``seqr.letter_annotations`` holding the
        constraint string
    :return: ``letter_annotations['ss0']`` of the first predicted record
    """

    fd, tmpf = mkstemp(prefix='rba_', suffix='_31', dir=CONFIG.tmpdir)
    try:
        with os.fdopen(fd, 'w') as fh:
            fh.write('>seq01\n{}\n{}\n'.format(
                str(seqr.seq), seqr.letter_annotations[constraints_id]))
        structure = rnafold_prediction(tmpf, params='-C')
    finally:
        # remove the constraint file even when writing or prediction fails
        remove_one_file_with_try(tmpf)
    return structure[0].letter_annotations['ss0']
Пример #9
0
def run_clustal_profile2seqs_align(msa_file,
                                   fasta_seq_file,
                                   clustalo_params='',
                                   outfile=None):
    """
    run clustal align MSA to seqs
    aligned columns in input MSA file are preserved and only new sequences are aligned and together they form new
     alignment

    When the first clustalo call fails, the profile is re-written with a
    trailing gap column appended to every sequence and clustalo is run once
    more on the re-written profile.

    :param msa_file: msa file (works with stockholm)
    :param fasta_seq_file: file with sequences to be aligned (format can be enforced with --infmt in clustalo_params)
    :param clustalo_params: params as accepted by clustalo
    :param outfile: outfile path, if not provided, tempfile will be created with output
    :return: outfile MSA path
    :raises exceptions.ClustaloException: when the rescue attempt fails as
        well; the second argument holds the captured clustalo output
    """
    ml.info('Runing clustalo profile.')
    ml.debug(fname())

    def _try_rescue(profile_file):
        # beware AlignIO truncates sequence names so they become non-unique, then clustalo also fails
        ml.warning(
            'Trying rescue for profile alignment if profile has no gaps, sequences appears not aligned. '
            'Appending trailing gap to overcome the issue.')
        a = AlignIO.read(profile_file, format='clustal')
        s = [SeqRecord(Seq(str(i.seq) + '-'), id=i.id) for i in a]
        fa = AlignIO.MultipleSeqAlignment(s)

        fd, temp = mkstemp(prefix='rba_', suffix='_56', dir=CONFIG.tmpdir)
        with os.fdopen(fd, 'w') as fh:
            AlignIO.write(fa, fh, format='fasta')
        return temp

    if outfile:
        clustalo_file = outfile
    else:
        c_fd, clustalo_file = mkstemp(prefix='rba_',
                                      suffix='_57',
                                      dir=CONFIG.tmpdir)
        os.close(c_fd)

    # truthiness check also tolerates clustalo_params=None
    extra_params = clustalo_params.split() if clustalo_params else []

    with TemporaryFile(mode='w+', encoding='utf-8') as tmp:
        cmd = [
            '{}clustalo'.format(CONFIG.clustal_path), '--force', '-i',
            fasta_seq_file, '--profile1', msa_file, '-o', clustalo_file
        ] + extra_params

        ml.debug(cmd)
        r = call(cmd, stdout=tmp, stderr=tmp)

        if r:
            ml.warning('Profile align failed.')

            # Initiate rescue attempt
            rewriten_msa = _try_rescue(msa_file)
            try:
                cmd2 = [
                    '{}clustalo'.format(CONFIG.clustal_path), '--force', '-i',
                    fasta_seq_file, '--profile1', rewriten_msa, '-o',
                    clustalo_file
                ] + extra_params

                ml.debug(cmd2)
                r2 = call(cmd2, stdout=tmp, stderr=tmp)
            finally:
                remove_one_file_with_try(rewriten_msa)

            if r2 != 0:
                msgfail = 'Call to clustalo for aligning profile to sequences failed.'
                ml.error(msgfail)
                ml.error(cmd)
                ml.error(cmd2)
                # the child processes advanced the shared file offset to the
                # end; rewind, otherwise the captured output reads as empty
                tmp.seek(0)
                details = tmp.read()
                if not outfile:
                    # the output tempfile was created here and its path is
                    # never handed to the caller on failure
                    remove_one_file_with_try(clustalo_file)
                raise exceptions.ClustaloException(msgfail, details)
    return clustalo_file
Пример #10
0
def cmmodel_rnafold_c(allhits_fasta, cmmodel_file, threads=None, params=None):
    """
    Predict structures with RNAfold constrained by a covariance model alignment.

    All hits are aligned to the model with ``run_cmalign_on_fasta``; for each
    aligned sequence the conserved pairs are written as a constraint line and
    the structure is predicted with ``rnafold_prediction``.

    :param allhits_fasta: fasta input, sanitized with ``sanitize_fasta_file``
    :param cmmodel_file: covariance model file used for the alignment
    :param threads: when truthy, added to cmalign parameters as ``--cpu``
    :param params: optional dict; keys read here are ``'cmalign'`` (extra
        cmalign parameters) and ``'RNAfold'`` (RNAfold parameters, ``-C`` is
        appended when missing)
    :return: result of ``desanitize_fasta_names_in_seqrec_list`` applied to
        the first predicted record of every sequence
    :raises AssertionError: when ``params['RNAfold']`` is not a string
    """
    ml.debug(fname())
    if params is None:
        params = dict()

    allhits_fasta_file, san_dict = sanitize_fasta_file(allhits_fasta)
    alig_file = None
    try:
        cmalign_params = ''
        if threads:
            cmalign_params += '--cpu {}'.format(threads)

        if 'cmalign' in params and params['cmalign']:
            cmalign_params += ' ' + params['cmalign']

        if '--notrunc' not in cmalign_params:
            cmalign_params += ' --notrunc'

        # rnafold params
        rnafold_params = params.get('RNAfold', '-C')
        # explicit raise so the check also holds when python runs with -O
        if not isinstance(rnafold_params, str):
            raise AssertionError("Incorrect parameters for RNAfold -C")
        if '-C' not in rnafold_params:
            # some parameters given but -C not present
            rnafold_params += ' -C'

        alig_file = run_cmalign_on_fasta(allhits_fasta_file,
                                         cmmodel_file,
                                         cmalign_params=cmalign_params)
        # multiple sequence cm align
        # split by sequence, then run the rest
        cm_alig = read_st(alig_file)
    finally:
        # remove intermediate files even when alignment or parsing fails
        remove_files_with_try(
            [f for f in (allhits_fasta_file, alig_file) if f is not None])

    structures = []
    for single_alig in trim_cmalign_sequence_by_refseq_one_seq(
            cm_alig, rs='SS_cons', convert2uppercase=True):
        _, trimmed_seq = trim_and_repair_single_cm_alignment(single_alig)
        conserved_structure_pairs = find_nc_and_remove(
            str(trimmed_seq.seq), trimmed_seq.letter_annotations['dec_str'])
        trimmed_seq.letter_annotations[
            'constrains'] = conserved_structure_pairs

        # constraint prediction
        # write constraint file
        fd, temp_constraint_file = mkstemp(prefix='rba_',
                                           suffix='_41',
                                           dir=CONFIG.tmpdir)
        try:
            with os.fdopen(fd, 'w') as tmpf:
                tmpf.write('>{}\n{}\n{}\n'.format(
                    trimmed_seq.id, str(trimmed_seq.seq),
                    trimmed_seq.letter_annotations['constrains']))

            single_structure = rnafold_prediction(temp_constraint_file,
                                                  params=rnafold_params)
        finally:
            remove_one_file_with_try(temp_constraint_file)

        structures.append(single_structure[0])

    str_out = desanitize_fasta_names_in_seqrec_list(structures, san_dict)

    return str_out
Пример #11
0
def alifold_refold_prediction(nr_homologs_hits_fasta,
                              all_hits_fasta,
                              refold='refold',
                              threads=None,
                              params=None,
                              msa_alg='clustalo'):
    """
    return predicted structures for all hits based on provided sequence homologs
    ! beware, clustal mixes order of sequences in profile alignment, correct for it

    Outline: the non-redundant homologs are aligned (``_aligner_block``), a
    consensus is computed (``compute_alifold``), all hits are profile-aligned
    to that alignment, the consensus is mapped onto the columns of the new
    alignment and the hits are refolded according to ``refold``.

    :param nr_homologs_hits_fasta: fasta with homologous sequences; sanitized
        with ``sanitize_fasta_file`` before use
    :param all_hits_fasta: fasta with all hits; sanitized with the same name
        dictionary
    :param refold: one of ``'refold'``, ``'refold_rnafoldc'``,
        ``'conserved_ss_rnafoldc'``
    :param threads: forwarded to ``_aligner_block``; when truthy also added to
        the clustalo profile parameters as ``--threads``
    :param params: optional dict; keys read in this function are
        ``'alifold'``, ``'clustalo_profile'``, ``'RNAfold'``,
        ``'repred_unpaired_tr'`` and ``'conseq_conserved'``; the whole dict is
        also passed to ``_aligner_block``
    :param msa_alg: forwarded to ``_aligner_block``
    :return: result of ``desanitize_fasta_names_in_seqrec_list`` applied to
        the refolded records
    :raises Exception: when ``refold`` is not one of the recognized values
    """
    ml.debug(fname())
    # both inputs share one sanitization dictionary so names can be restored
    # with a single desanitize call at the end
    nr_path, san_dict = sanitize_fasta_file(nr_homologs_hits_fasta)
    all_path, san_dict = sanitize_fasta_file(all_hits_fasta,
                                             used_dict=san_dict)

    if params is None:
        params = dict()

    ref_pred = ['refold', 'refold_rnafoldc', 'conserved_ss_rnafoldc']
    if refold not in ref_pred:
        raise Exception(
            'refold procedure not recognized: {}, possible values are {}'.
            format(refold, ' '.join(ref_pred)))

    cl_file = _aligner_block(nr_path, params, msa_alg, threads)

    # cannot rely on that, the order of a cl_file would be the same as the order of the nr_homolog_hits_file
    ali_file = compute_alifold(cl_file,
                               alifold_params=params.get('alifold', ''))

    # only the first record of the alifold output is used as the consensus
    consensus_record = read_seq_str(ali_file)[0]

    # output format is forced to clustal because the result is read back
    # with AlignIO in clustal format below
    clustalo_profile_params = '--outfmt clustal '
    clustalo_profile_params += params.get('clustalo_profile', '')
    if threads:
        clustalo_profile_params += ' --threads {}'.format(threads)
    realign_file = run_clustal_profile2seqs_align(
        cl_file, all_path, clustalo_params=clustalo_profile_params)
    realign_alig = AlignIO.read(realign_file, format='clustal')

    # slice alignment (get seqname from nr_homolog_hits_file, find it in the realign and slice the whole segment off)
    #  take care that the id may be the same and it must be checked for multiple occurrence

    first_nr_record = _parse_first_record_only(nr_path)

    # indices of all rows in the realignment whose id equals the id of the
    # first homolog record
    realign_allseq_possition = [
        i for i, seq in enumerate(realign_alig) if seq.id == first_nr_record.id
    ]

    # split at the LAST matching row: rows before it are refolded, rows from
    # it on are treated as the original profile.
    # NOTE(review): indexing with [-1] raises IndexError when no row matches.
    new_alig_for_refold = realign_alig[:realign_allseq_possition[-1]]
    old_alig_in_new = realign_alig[realign_allseq_possition[-1]:]

    orig_alignment = AlignIO.read(cl_file, format='clustal')

    first_original_alignment_record = orig_alignment[0]

    # the same sequence as it appears inside the new (profile) alignment
    match_original_seq_in_new_alig = [
        i for i in old_alig_in_new
        if i.id == first_original_alignment_record.id
    ][0]

    mapping = _map_alignment_columns_from_profile_match(
        first_original_alignment_record, match_original_seq_in_new_alig)

    # map and repair structure when mapping is unambiguous
    cs_encode = encode_structure_unicode(
        consensus_record.letter_annotations['ss0'])
    # 49 == ord('1'); presumably the code point representing a gap in the
    # encoded structure - TODO confirm against encode_structure_unicode
    new_consensus_structure_encoded = _repair_consensus_structure_by_maping(
        cs_encode,
        mapping,
        len(match_original_seq_in_new_alig.seq),
        gap_char=49)
    new_consensus_structure_repaired = repair_structure_any_variant(
        new_consensus_structure_encoded)

    new_consensus_structure = decode_structure_unicode(
        new_consensus_structure_repaired)

    # the consensus sequence is mapped with the same column mapping, using
    # '_' as the gap character
    new_consensus_sequence = _repair_consensus_structure_by_maping(
        str(consensus_record.seq),
        mapping,
        len(match_original_seq_in_new_alig.seq),
        gap_char=ord('_'))

    # write new consensus to a file
    a_fd, new_alifold_consensus_file = mkstemp(prefix='rba_',
                                               suffix='_33',
                                               dir=CONFIG.tmpdir)
    with os.fdopen(a_fd, 'w') as f:
        f.write(new_consensus_sequence + '\n')
        f.write(new_consensus_structure + '\n')

    # write sliced alignment to a file
    sa_fd, sliced_alignment_file = mkstemp(prefix='rba_',
                                           suffix='_34',
                                           dir=CONFIG.tmpdir)
    with os.fdopen(sa_fd, 'w') as f:
        AlignIO.write(new_alig_for_refold, f, 'clustal')

    # now process the file, and map alignment to consensus structure
    if refold in ['refold', 'refold_rnafoldc']:
        refold_file = compute_refold(sliced_alignment_file,
                                     new_alifold_consensus_file)

        if refold == 'refold_rnafoldc':
            # the refold output is used as input for a constrained RNAfold
            # run; -C is appended when the user parameters lack it
            rnafold_parameters = params.get('RNAfold', '')
            if '-C' not in rnafold_parameters:
                rnafold_parameters += ' -C'

            seq_str = rnafold_prediction(refold_file,
                                         params=rnafold_parameters)

        else:
            seq_str = read_seq_str(refold_file)

        remove_one_file_with_try(refold_file)

    else:
        # remaining option: 'conserved_ss_rnafoldc'
        st_alig_file = build_stockholm_from_clustal_alig(
            sliced_alignment_file, new_alifold_consensus_file)
        repred_tr = str(params.get('repred_unpaired_tr', '9'))
        conseq_conserved = params.get('conseq_conserved', 1)

        seq_str = _refold_with_unpaired_conservation(
            st_alig_file,
            repred_tr=repred_tr,
            conseq_conserved=conseq_conserved)
        remove_one_file_with_try(st_alig_file)

    structures_out = desanitize_fasta_names_in_seqrec_list(seq_str, san_dict)

    # intermediate files are removed only here, i.e. only when every step
    # above succeeded
    remove_files_with_try([
        nr_path, all_path, sliced_alignment_file, new_alifold_consensus_file,
        cl_file, ali_file, realign_file
    ])

    return structures_out
Пример #12
0
def expand_hits_from_fasta(hits, database, query_length, extra=0, blast_regexp=None, skip_missing=False, msgs=None, format='fasta', entrez_email=None, blast_input_file=None):
    """takes list of blast.HSP objects and return extended sequences

    Each hit is indexed as ``hit[0]`` (source for the accession, replaced in
    place by the accession matched with ``match_acc``) and ``hit[1]`` (object
    with ``sbjct_start``, ``sbjct_end``, ``query_start``, ``query_end``).

    :param hits: list of hits as described above
    :param database: directory with one file per accession for formats
        ``'fasta'`` / ``'gb'``; passed to ``GenomeDB`` for ``'server'``
    :param query_length: length of the query, used to compute how far the
        hit must be extended
    :param extra: additional number of positions added on both sides
    :param blast_regexp: passed to ``match_acc``
    :param skip_missing: when True, sequences which cannot be retrieved are
        skipped with a warning instead of terminating
    :param msgs: optional list collecting warning messages
    :param format: ``'fasta'``, ``'gb'``, ``'server'`` or ``'entrez'``
    :param entrez_email: assigned to ``Entrez.email`` before downloading
    :param blast_input_file: used to name the entrez cache file; only
        needed when ``format == 'entrez'``
    :return: tuple ``(exp_hits, strand)``; list of SeqRecord objects (parsed
        fasta file) and list of strand values (1 / -1).
        NOTE(review): ``strand`` gets an entry for every hit before any skip
        happens, so it can be longer than ``exp_hits`` when hits are
        skipped - verify against callers.
    :raises LookupError: when a sequence file is missing and ``skip_missing``
        is False
    :raises NotImplementedError: for an unknown ``format``
    """
    ml.info('Retrieving sequence neighborhoods for blast hits.')
    ml.debug(fname())

    if format == 'server':
        # conditional import so we don't need pysam for normal usage
        from rna_blast_analyze.BR_core.load_from_bgzip import GenomeDB
        seqdb = GenomeDB(database)

    # the entrez cache file is only relevant (and blast_input_file only
    # required) when sequences are downloaded with entrez
    temp_entrez_file = None
    known_seq_index = {}
    if format == 'entrez':
        tmp_root = gettempdir() if CONFIG.tmpdir is None else CONFIG.tmpdir
        temp_entrez_file = os.path.join(
            tmp_root, os.path.basename(blast_input_file + '.r-temp_entrez')
        )
        try:
            known_seq_index = SeqIO.index(temp_entrez_file, format='fasta')
            ml.info("File {} loaded.".format(temp_entrez_file))
            if len(known_seq_index) == 0:
                remove_one_file_with_try(temp_entrez_file)
        except FileNotFoundError:
            # ignore that we don't have that file (usual)
            known_seq_index = {}
        except Exception:
            # best effort: an unreadable cache only means re-downloading
            ml.info("Could not load the temporary file {}.".format(temp_entrez_file))
            known_seq_index = {}

    exp_hits = []
    strand = []
    for index, hit in enumerate(hits):
        # +1 here because blastdbcmd counts sequences from 1
        if hit[1].sbjct_end < hit[1].sbjct_start:
            # this is hit to minus strand
            start = hit[1].sbjct_end - _positive_index(query_length - hit[1].query_end) - extra
            end = hit[1].sbjct_start + hit[1].query_start + extra - 1
            strand.append(-1)
            d = {'query_start': hit[1].sbjct_end, 'query_end': hit[1].sbjct_start,
                 'extended_start': hit[1].sbjct_end - _positive_index(query_length - hit[1].query_end),
                 'extended_end': hit[1].sbjct_start + hit[1].query_start - 1,
                 'strand': -1}
        else:
            # this is hit to plus strand
            start = hit[1].sbjct_start - hit[1].query_start + 1 - extra
            end = hit[1].sbjct_end + _positive_index(query_length - hit[1].query_end) + extra
            strand.append(1)
            d = {'query_start': hit[1].sbjct_start, 'query_end': hit[1].sbjct_end,
                 'extended_start': hit[1].sbjct_start - hit[1].query_start + 1,
                 'extended_end': hit[1].sbjct_end + _positive_index(query_length - hit[1].query_end),
                 'strand': 1}

        # ====== information about possible trim ======
        # assume ok
        d['trimmed_ss'] = False
        d['trimmed_se'] = False
        d['trimmed_es'] = False
        d['trimmed_ee'] = False

        d['super_start'] = start
        d['super_end'] = end

        if start < 1:
            start = 1                    # index from which sequence should be retrieved from the db
            d['trimmed_ss'] = True

        # repair possible extended start violation
        if d['extended_start'] < 1:
            d['trimmed_es'] = True

        # add blast record
        d['blast'] = hit

        # AccessionMatchException (if any) propagates to the caller
        bdb_accession = match_acc(hit[0], blast_regexp)

        d['blast'][0] = bdb_accession

        # read from file
        if format in ['fasta', 'gb']:
            ff = os.path.join(database, bdb_accession)
            if not os.path.isfile(ff):
                if skip_missing:
                    msgwarn = 'Sequence {} not found in provided db. Skipping.'.format(bdb_accession)
                    if msgs is not None:
                        msgs.append(msgwarn)
                    ml.warning(msgwarn)
                    # actually skip the hit, same as the entrez branch does,
                    # instead of failing on open() of the missing file
                    continue

                msgerror = 'Sequence {} not found in provided db. ' \
                           'Please provide correct database or give "--skip_missing" flag.'.format(
                    bdb_accession
                )
                ml.error(msgerror)
                raise LookupError(msgerror)

            with open(ff, 'r') as handle:
                ext_seq = next(SeqIO.parse(handle, format=format))
                parsed_record = ext_seq[start - 1:end]

        elif format == 'server':
            # only used when server
            parsed_record = seqdb.load_genome(bdb_accession, start - 1, end)

        elif format == 'entrez':
            prnt_line = '{:3d}% {}'.format(floor(index * 100 / len(hits)), bdb_accession)
            if index == 0:
                sys.stdout.write('STATUS: Downloading required sequences from NCBI with entrez.\n')
                sys.stdout.write('{:50}'.format(prnt_line))
            else:
                sys.stdout.write('\r{:50}'.format(prnt_line))

            seq_id = '{}:{}-{}'.format(bdb_accession, start, end)
            if seq_id not in known_seq_index:
                try:
                    Entrez.email = entrez_email
                    parsed_record = fetch_accession_range(bdb_accession, start, end)

                    # append to the cache file so an interrupted run can resume
                    with open(temp_entrez_file, 'a') as tmpf:
                        SeqIO.write([parsed_record], tmpf, format='fasta')

                    if index == len(hits) - 1:
                        sys.stdout.write('\r{:50}\n'.format(' Done.'))
                except HTTPException as e:
                    msg = 'HTTP exception encountered: {}' \
                          'Please check your internet connection and availability of NCBI ENTREZ web services.\n ' \
                          'Also check that the requested accession number "{}" is available in NCBI "nucleotide" database.'.format(
                        e, bdb_accession)
                    if skip_missing:
                        ml.warning(msg)
                        continue
                    else:
                        ml.error(msg)
                        sys.exit(1)
                except ValueError:
                    msg = 'Received malformed fasta file. ' \
                          'Please check that requested accession number "{}" is available in NCBI nucleotide database.'.format(
                        bdb_accession)
                    if skip_missing:
                        ml.warning(msg)
                        continue
                    else:
                        ml.error(msg)
                        sys.exit(1)
            else:
                parsed_record = known_seq_index[seq_id]
        else:
            raise NotImplementedError

        record_id = parsed_record.id.split(':')[0]

        if parsed_record.description.startswith(parsed_record.id):
            parsed_record.description = parsed_record.description[len(parsed_record.id):].strip()

        parsed_record.annotations = d
        parsed_record.annotations['msgs'] = []
        # add uid to ensure that all hits are unique
        parsed_record.id = 'uid:' + str(index) + '|' + record_id

        # detect whether the retrieved sequence is shorter than requested
        if d['trimmed_ss']:
            if d['super_start'] + len(parsed_record.seq) < d['super_end'] + d['super_start']:
                parsed_record.annotations['trimmed_se'] = True
                if d['trimmed_es']:
                    if len(parsed_record.seq) < d['extended_end'] + d['extended_start']:
                        parsed_record.annotations['trimmed_ee'] = True
                else:
                    if len(parsed_record.seq) < d['extended_end'] - d['extended_start']:
                        parsed_record.annotations['trimmed_ee'] = True
        else:
            if d['super_start'] + len(parsed_record.seq) - 1 < d['super_end']:
                parsed_record.annotations['trimmed_se'] = True
                if d['super_start'] + len(parsed_record.seq) - 1 < d['extended_end']:
                    parsed_record.annotations['trimmed_ee'] = True

        msgsub = '{}: Sequence cannot be extended sufficiently'.format(parsed_record.id)
        if parsed_record.annotations['trimmed_ss']:
            # NOTE(review): the value reported is the requested start
            # coordinate (super_start), not a count of missing nucleotides -
            # confirm the intended wording.
            msgwarn = msgsub + '. Missing {} nt upstream in the genome.'.format(parsed_record.annotations['super_start'])
            parsed_record.annotations['msgs'].append(msgwarn)
            ml.warning(msgwarn)
        if parsed_record.annotations['trimmed_se']:
            msgwarn = msgsub + '. Missing nt downstream in the genome.'
            parsed_record.annotations['msgs'].append(msgwarn)
            ml.warning(msgwarn)
        if parsed_record.annotations['trimmed_es']:
            msgwarn = msgsub + ' by unaligned portion of query. THIS IS PROBABLY FRAGMENT!'
            msgwarn += ' Trimmed upstream.'
            parsed_record.annotations['msgs'].append(msgwarn)
            ml.warning(msgwarn)
        if parsed_record.annotations['trimmed_ee']:
            msgwarn = msgsub + ' by unalined portion of query. THIS IS PROBABLY FRAGMENT!'
            msgwarn += ' Trimmed downstream.'
            parsed_record.annotations['msgs'].append(msgwarn)
            ml.warning(msgwarn)

        exp_hits.append(parsed_record)

    # ==== Remove the entrez tempfile =====
    # here we have all the sequences and we can safely delete the tempfile
    if format == 'entrez':
        remove_one_file_with_try(temp_entrez_file)
    return exp_hits, strand