Exemplo n.º 1
0
def _msa_count_columns(input_fn):
    """Count the columns of a multiple sequence alignment in FASTA format.

    Every record in the file must have the same sequence length; that
    shared length is the number of alignment columns.

    :param input_fn: path of the FASTA file holding the alignment.
    :returns: the sequence length shared by all records in the file.
    :raises ValueError: if the file contains no FASTA record, or if the
        sequences do not all have the same length.
    """
    # Plain "r" is used because the "U" mode flag was removed in
    # Python 3.11; text mode already handles universal newlines there.
    with open(input_fn, "r") as input_handle:
        parser = SimpleFastaParser(input_handle)
        try:
            # The next() builtin works on Python 2.6+ and Python 3, unlike
            # the Python-2-only iterator method .next().
            _, first_seq = next(parser)
        except StopIteration:
            raise ValueError("{} is not a valid FASTA template file"
                             .format(input_fn))
        seqlen = len(first_seq)
        for _, seq in parser:
            if len(seq) != seqlen:
                raise ValueError("sequences in template file must all be the "
                                 "same length")
    return seqlen
Exemplo n.º 2
0
def read_fasta(files, in_type=Peptide, id_position=1):
    """
    Read peptide, protein or RNA sequences from one or more FASTA files.

    The caller specifies the type of the underlying sequences. It can
    either be: Peptide, Protein or Transcript (for RNA). Sequences are
    stripped and upper-cased before being wrapped in ``in_type``, and the
    results are collected in a set, so equal objects are returned once and
    the order of the returned list is not defined.

    :param files: A (list) of file names to read in
    :type files: list(str) or str
    :param in_type: The type to read in
    :type in_type: :class:`~Fred2.Core.Peptide.Peptide` or :class:`~Fred2.Core.Transcript.Transcript`
                or :class:`~Fred2.Core.Protein.Protein`
    :param int id_position: the position of the id specified counted by |
    :returns: a list of the specified sequence type derived from the FASTA file sequences.
    :rtype: (list(:attr:`in_type`))
    :raises ValueError: if one of the specified files does not exist
    """
    # NOTE(review): `basestring` exists only on Python 2; it is kept because
    # the target interpreter of this module cannot be determined from here.
    if isinstance(files, basestring):
        files = [files]

    # Validate every path, including a single file name passed as a string,
    # so a missing file always surfaces as the documented ValueError.
    if any(not os.path.exists(f) for f in files):
        raise ValueError("Specified Files do not exist")

    collect = set()
    for name in files:
        with open(name, 'r') as handle:
            for title, seq in SimpleFastaParser(handle):
                # Use the requested "|"-separated field as identifier and
                # fall back to the whole title if the field is missing.
                try:
                    _id = title.split("|")[id_position]
                except IndexError:
                    _id = title

                sequence = seq.strip().upper()
                # Retry without the keyword when constructing in_type with
                # `transcript_id` raises TypeError.
                try:
                    collect.add(in_type(sequence, transcript_id=_id))
                except TypeError:
                    collect.add(in_type(sequence))
    return list(collect)
def main(fasta_file, bin_composition, set_bins, output):
    """Split a FASTA file into one FASTA file per selected bin.

    :param fasta_file: path of the FASTA file holding the contigs.
    :param bin_composition: path of a comma-separated file whose first
        column is a contig id and whose second column is its bin.
    :param set_bins: collection of bin names to extract; contigs assigned
        to other bins are ignored.
    :param output: directory in which the ``Bin_<bin>.<ext>`` files are
        created, where ``<ext>`` is the text after the last "." of
        ``fasta_file``.
    """
    # contig id -> bin name, restricted to the requested bins
    contig_to_bin = {}
    with open(bin_composition) as composition:
        for line in composition:
            fields = line.rstrip().split(",")
            if fields[1] in set_bins:
                contig_to_bin[fields[0]] = fields[1]

    extension = fasta_file.split(".")[-1]
    bin_handles = {}
    try:
        for bin_name in set(contig_to_bin.values()):
            bin_handles[bin_name] = open(
                output + "/Bin_" + bin_name + "." + extension, "w")

        with open(fasta_file) as fasta_handle:
            for contig_id, seq in SimpleFastaParser(fasta_handle):
                # The composition file is keyed by the first word of the
                # FASTA title; the full title is written back out.
                short_id = contig_id.split()[0]
                if short_id in contig_to_bin:
                    bin_handles[contig_to_bin[short_id]].write(
                        ">" + contig_id + "\n" + seq + "\n")
    finally:
        # Close every output that was opened, even if an error occurred.
        for handle in bin_handles.values():
            handle.close()
Exemplo n.º 4
0
def _rename_seqids(input_fn, otuids_fn, prefix=""):
    """Rename the records of a FASTA file in place with sequential ids.

    Each record gets the id ``<prefix><n>`` where ``n`` is its 1-based
    position in the file. The mapping from new id to original id (the
    first word of the original title) is written to ``otuids_fn`` as
    tab-separated lines.

    :param input_fn: path of the FASTA file to rewrite; it is replaced by
        the renamed version.
    :param otuids_fn: path of the mapping file to write.
    :param prefix: string prepended to every new numeric id.
    """
    output_dir = os.path.dirname(input_fn)
    # The temporary file is created next to the input file and renamed over
    # it once all records have been written.
    tmp_fn = micca.ioutils.make_tempfile(output_dir)

    # Text modes are used throughout: "rU" was removed in Python 3.11 and
    # writing str objects to a "wb" handle fails on Python 3.
    with open(input_fn, "r") as input_handle, \
            open(otuids_fn, "w") as otuids_handle, \
            open(tmp_fn, "w") as tmp_handle:
        for i, (title, seq) in enumerate(SimpleFastaParser(input_handle)):
            origid = title.split()[0]
            newid = "{}{:d}".format(prefix, i + 1)
            otuids_handle.write("{}\t{}\n".format(newid, origid))
            tmp_handle.write(">{}\n{}\n".format(newid, seq))

    os.rename(tmp_fn, input_fn)
def minimise_large_cluster_1(nodes, orig_roary_dirs):
    ''' Find near-identical sequences in large clusters using cd-hit-est.

    The sequences of all given nodes are written to "1_cluster_fasta.fa",
    clustered with cd-hit-est, and every member that is not flagged with
    "*" in the resulting .clstr file is reported for removal.

    nodes: iterable of node names of the form "<cluster>_<member>".
    orig_roary_dirs: paths of the roary directories; the directory whose
        base name starts with "<cluster>_" is used for that cluster.
    Returns: list of node names that the caller can remove from the graph.
    Raises: ValueError if no roary directory matches a cluster.

    NOTE(review): `prefix` is not defined in this function; it is
    presumably a module-level name - confirm.
    '''
    # cluster -> set of member names (set gives O(1) membership tests below)
    members_per_cluster = {}
    for node in nodes:
        cluster, _, member = node.partition("_")
        members_per_cluster.setdefault(cluster, set()).add(member)

    with open("1_cluster_fasta.fa", "w") as tmp_out:
        for curr_cluster, members in members_per_cluster.items():
            # Look the directory up afresh for every cluster so that a
            # missing match cannot silently reuse the previous cluster's
            # directory.
            ref_dir = next(
                (d for d in orig_roary_dirs
                 if d.split("/")[-1].split("_")[0] == str(curr_cluster)),
                None)
            if ref_dir is None:
                raise ValueError(
                    "no roary directory found for cluster {}".format(
                        curr_cluster))
            with open(
                    os.path.join(ref_dir, prefix +
                                 "mode_pan_genome_reference.fa")) as handle:
                for title, seq in SimpleFastaParser(handle):
                    # The member name is the second word of the FASTA title.
                    name = title.split()[1]
                    if name in members:
                        ## This output proves how choosing a different reference can really alter the results.
                        tmp_out.write(">" + curr_cluster + "_" + name + "\n" +
                                      seq.split(";")[0] + "\n")

    ## run cdhit to reduce the number of sequences
    p = subprocess.Popen([
        "cd-hit-est", "-i", "1_cluster_fasta.fa", "-o",
        "1_cluster_clustered.fa", "-c", "0.9", "-T", "4", "-d", "0", "-A",
        "0.9", "-n", "8", "-s", "0.9"
    ])
    p.wait()

    nodes_to_remove = []
    with open("1_cluster_clustered.fa.clstr") as f:
        for line in f:
            # Lines starting with ">" are skipped; all other lines name
            # one member each.
            if line.startswith(">"):
                continue
            member = line.strip().split("...")[0].split(">")[-1]
            # Members whose line carries "*" are kept; all others are
            # reported for removal.
            if "*" not in line:
                nodes_to_remove.append(member)
    return nodes_to_remove
Exemplo n.º 6
0
def Sortbysize(input, n50, minlen=500):
    """Partition FASTA record names by sequence length, longest first.

    Records shorter than ``minlen`` are dropped. Of the remaining records,
    those at least ``n50`` long go to the second list (when ``n50`` is
    truthy); all others go to the first list.

    :param input: path of the FASTA file.
    :param n50: length threshold for the "keep" list; a falsy value puts
        every record in the first list.
    :param minlen: minimum sequence length for a record to be reported.
    :returns: tuple ``(contigs, keep)`` of title lists, each ordered by
        decreasing sequence length.
    """
    with open(input, 'r') as fasta:
        sizes = [(title, len(seq)) for title, seq in SimpleFastaParser(fasta)]
    # longest sequences first
    sizes.sort(key=lambda item: item[1], reverse=True)

    contigs, keep = [], []
    for title, size in sizes:
        if size < minlen:
            continue
        if n50 and size >= n50:
            keep.append(title)
        else:
            contigs.append(title)
    return contigs, keep
Exemplo n.º 7
0
def count_total_c_and_g_in_reads(ref_name, cg_posits, in_file):
    """Count reads' nucleotide calls at the given C and G positions.

    For every record of ``./input_data/<in_file>`` except the one titled
    ``ref_name``, each index in ``cg_posits["C_pos"]`` and
    ``cg_posits["G_pos"]`` is looked up in the read; the counter goes up
    when the character found there is one of A, T, G or C.

    :param ref_name: title of the record to skip.
    :param cg_posits: mapping with the keys "C_pos" and "G_pos", each
        holding an iterable of sequence indices.
    :param in_file: FASTA file name inside ``./input_data/``.
    :returns: tuple ``(c_counter, g_counter)``.
    """
    valid_bases = ["A", "T", "G", "C"]
    totals = {"C_pos": 0, "G_pos": 0}

    with open("./input_data/" + in_file) as in_handle:
        for title, read in SimpleFastaParser(in_handle):
            if title == ref_name:
                continue
            for key in ("C_pos", "G_pos"):
                totals[key] += sum(
                    1 for pos in cg_posits[key] if read[pos] in valid_bases)

    return totals["C_pos"], totals["G_pos"]
Exemplo n.º 8
0
def qc_fasta(fasta_file, qc, out, name=""):
    ''' Check whether a FASTA file meets the QC requirements.

    The file fails when it has more records than qc["max_contigs"] or when
    its total sequence length, expressed in millions of characters, is
    below qc["min_length"] or above qc["max_length"].

    fasta_file: path of the FASTA file to check.
    qc: mapping with the keys "max_contigs", "min_length", "max_length".
    out: writable handle; for a failing file the line
        "<name>,<number of contigs>,<length>" is written to it.
    name: label used in that line; defaults to fasta_file when empty.
    Return: True if the file passes, False otherwise. '''
    label = fasta_file if name == "" else name

    with open(fasta_file) as handle:
        seq_lengths = [
            len(seq.strip()) for _, seq in SimpleFastaParser(handle)
        ]
    num_contigs = len(seq_lengths)
    length = sum(seq_lengths) / 1000000.0

    failed = (qc["max_contigs"] < num_contigs or qc["min_length"] > length
              or qc["max_length"] < length)
    if not failed:
        return True
    out.write(label + "," + str(num_contigs) + "," + str(length) + "\n")
    return False
Exemplo n.º 9
0
def launch_seq_kmers_pool( fastx, ftype, k, threads, target_range, combined_kmers, count, frac ):
    """Collect per-read k-mer jobs from a FASTQ/FASTA file and run them.

    Each read is passed to ``build_args_for_kmer_calc``, which extends the
    job list and reports a status; reading stops as soon as the status is
    "over". The jobs are then handed to ``launch_pool`` together with
    ``calc_seq_kmer_freqs``.

    :param fastx: path of the input file.
    :param ftype: "fastq" or "fasta"; for any other value no read is
        parsed and the pool is launched with an empty job list.
    :param k, target_range, combined_kmers, count, frac: forwarded
        unchanged to ``build_args_for_kmer_calc``.
    :param threads: forwarded to ``launch_pool``.
    :returns: tuple ``(dict(results), lengths_d)`` where ``lengths_d`` is
        the dict handed to ``build_args_for_kmer_calc`` for every read.
    """
    args      = []
    lengths_d = {}

    # Both parsers yield tuples starting with (title, sequence); the FASTQ
    # one additionally yields the quality string, which is not used here.
    parsers = {"fastq": FastqGeneralIterator, "fasta": SimpleFastaParser}
    parser = parsers.get(ftype)

    if parser is not None:
        # The context manager closes the input even on an early break.
        with open(fastx) as handle:
            for read_num, record in enumerate(parser(handle)):
                read_id, seq = record[0], record[1]
                args, status = build_args_for_kmer_calc(read_num, target_range, args, read_id, seq, k, combined_kmers, lengths_d, count, frac)
                if status == "over":
                    break

    results = launch_pool( threads, calc_seq_kmer_freqs, args )

    return dict(results), lengths_d
def quick_FASTA_reader(file):
    """Read a whole FASTA file into a list of string tuples (DEPRECATED).

    The single argument 'file' is the filename of a FASTA format file. The
    file is opened and read completely, and every record is returned as a
    tuple of two strings: the title line and the sequence.

    >>> seqs = quick_FASTA_reader("Fasta/dups.fasta")
    >>> for title, sequence in seqs:
    ...     print("%s %s" % (title, sequence))
    alpha ACGTA
    beta CGTC
    gamma CCGCC
    alpha (again - this is a duplicate entry to test the indexing code) ACGTA
    delta CGCGC

    This function is fast, but because it returns the data as a single in
    memory list, it is unsuitable for large files where an iterator
    approach is preferable.

    You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta"),
    which lets you iterate over the records one by one (avoiding having
    all the records in memory at once).  Using Bio.SeqIO also makes it easy
    to switch between different input file formats.  However, please note
    that rather than simple strings, Bio.SeqIO uses SeqRecord objects for
    each record.

    If you want to use simple strings, use the function SimpleFastaParser
    added to Bio.SeqIO.FastaIO in Biopython 1.61 instead.
    """
    import warnings
    from Bio import BiopythonDeprecationWarning

    message = ("The quick_FASTA_reader has been deprecated and will be "
               "removed in a future release of Biopython. Please try "
               "function SimpleFastaParser from Bio.SeqIO.FastaIO "
               "instead.")
    warnings.warn(message, BiopythonDeprecationWarning)

    from Bio.SeqIO.FastaIO import SimpleFastaParser
    with open(file) as handle:
        return list(SimpleFastaParser(handle))
Exemplo n.º 11
0
def cat_by_id(in_fastas=False, out_name="cat_by_id_"):
    """
    Concatenate sequences that share an id across several fasta files
    and write the concatenated records to a single fasta file.
    ----------------
    in_fastas: list or tuple
        input fasta files; when falsy, every file in the current directory
        whose extension is fasta, fas or fa is used
    out_name: str
        prefix of the output fasta file name; a formatted time stamp and
        ".fasta" are appended
    ----------------
    NOTE: files must start with number like this : '1_',
    to denote the order of concatenation. Sequences are concatenated in
    the order in which the files appear in in_fastas.
    """
    valid_extensions = ["fasta", "fas", "fa"]

    if not in_fastas:
        in_fastas = [
            f for f in os.listdir("./")
            if f.rsplit(".", 1)[-1] in valid_extensions
        ]
    intersected_ids = _get_intersected_ids(in_fastas)
    intersected_ids.sort()  # to sort record by their number, which goes first

    # Parse every file once instead of once per id. Only the first record
    # with a given title in a file contributes.
    wanted = set(intersected_ids)
    cat_seqs = dict.fromkeys(intersected_ids, "")
    for f in in_fastas:
        seen_in_file = set()
        with open(f) as in_handle:
            for title, seq in SimpleFastaParser(in_handle):
                if title in wanted and title not in seen_in_file:
                    seen_in_file.add(title)
                    cat_seqs[title] += seq

    # The record id is the id being assembled, not whatever title the
    # parsing loop happened to read last.
    intersected_records = [
        SeqRecord(Seq(cat_seqs[seq_id]), id=seq_id, description="")
        for seq_id in intersected_ids
    ]

    time_stamp = _get_current_time()
    time_stamp = _format_time_stamp(time_stamp)

    SeqIO.write(intersected_records, out_name + time_stamp + ".fasta", "fasta")
Exemplo n.º 12
0
def check_property(input, stop_table):
    """Collect per-record properties of a FASTA file into module globals.

    For every record the name (title up to the first ";" with any ">"
    removed), the result of ``freq_check`` on the title, the result of
    ``length_check`` on the sequence and the minimum of ``stopcount`` over
    frames 1-3 are gathered.

    Side effects: rebinds the globals ``uniqs`` (list of record names) and
    ``out_df`` (DataFrame with the columns name, freq, length and
    stop_codon). Nothing is returned.

    :param input: path of the FASTA file.
    :param stop_table: forwarded to ``stopcount``.
    """
    global uniqs
    global out_df
    with open(input) as infasta:
        uniqs = []  # record names, one per row
        frequencies = []
        lengths = []
        stop_codons = []

        print('#Start checking freq and len#')
        for head, seq in SimpleFastaParser(infasta):
            # keep the part before the first ";" and drop any ">" symbol
            record_name = head.split(';')[0].replace('>', '')
            uniqs.append(record_name)
            frequencies.append(freq_check(head))
            lengths.append(length_check(seq))
            # retain the smallest stop codon count over the three frames
            per_frame = stopcount(SeqRecord(Seq(seq)), stop_table,
                                  frame=(1, 2, 3))
            stop_codons.append(min(per_frame))

        # build the table column by column, names first
        out_df = pd.DataFrame(data=uniqs, columns=['name'])
        out_df['freq'] = frequencies
        out_df['length'] = lengths
        out_df['stop_codon'] = stop_codons
def get_assembly_stats(assembly_fp):
    """Compute summary statistics of the contig lengths in an assembly.

    :param assembly_fp: object with an ``open('r')`` method that yields a
        FASTA text handle (presumably a ``pathlib.Path`` - confirm).
    :returns: tuple ``(contig_number, n50, q1, q2, q3, mean, smallest,
        largest, length)``; ``mean`` is rounded to an int and ``n50`` is
        the largest contig when there is only one contig.
    """
    with assembly_fp.open('r') as handle:
        lengths = [len(seq) for _, seq in SimpleFastaParser(handle)]

    count = len(lengths)
    total = sum(lengths)
    shortest, longest = min(lengths), max(lengths)
    average = int(round(statistics.mean(lengths), 0))
    q1, q2, q3 = calculate_quartiles(lengths)

    # calculate_n50 is only called when there is more than one contig
    n50 = calculate_n50(lengths, total / 2) if count > 1 else longest

    return count, n50, q1, q2, q3, average, shortest, longest, total
Exemplo n.º 14
0
    def read_fasta(fasta, length=False):
        """
        Read a fasta format file into a dict.

        Parameters:
        -----------
        fasta:str
            fasta format file
        length:bool
            output length instead of sequence, default False.

        Returns:
        --------
        Return a dict with the record title as key and the sequence (or
        its length when ``length`` is true) as value. When a title occurs
        more than once, the last record wins.
        """
        fh = files.perfect_open(fasta)
        # try/finally guarantees the handle is closed even if parsing fails.
        try:
            if length:
                return {t: len(seq) for t, seq in SimpleFastaParser(fh)}
            return {t: seq for t, seq in SimpleFastaParser(fh)}
        finally:
            fh.close()
Exemplo n.º 15
0
def main():
  """Filter a FASTA file by record length from the command line.

  Records whose sequence length is at least --min (default 1000) and, when
  --max is given and non-zero, at most --max are copied from --input to
  --output. Exits with a usage error when --input or --output is missing.
  """
  #To parse command line
  usage = "usage: %prog [options]"
  p = optparse.OptionParser(usage)

  p.add_option('-i', '--input', help='Input fasta [None,REQD]')
  p.add_option('--min', type="int", default=1000, help="Minimum size of record to keep [1000]")
  p.add_option('--max', type="int", help="Maximum size of record to keep [None]")
  p.add_option('-o', '--output', help='Output fasta [None,REQD]')

  opts, args = p.parse_args()

  # Both paths are documented as required; report a usage error rather
  # than failing inside open() with a TypeError on None.
  if not opts.input or not opts.output:
    p.error("both --input and --output are required")

  with open(opts.input, "r") as fin, open(opts.output, "w") as fout:
    for title, seq in SimpleFastaParser(fin):
      size = len(seq)
      if size < opts.min:
        continue
      # A missing (or zero) --max means "no upper bound".
      if opts.max and size > opts.max:
        continue
      fout.write(">%s\n%s\n" % (title, seq))
Exemplo n.º 16
0
def _count_ins(input_file, ref_seq, ref_seq_id, cov):
    """Accumulate insertion lengths per position over all reads.

    Every record of ``./input_data/<input_file>`` except the one titled
    ``ref_seq_id`` is paired with ``ref_seq``; the reference side of the
    pairs is scanned for indels and the lengths found are appended to the
    per-position lists returned by ``_get_ref_array``.

    :param input_file: FASTA file name inside ``./input_data/``.
    :param ref_seq: reference sequence passed to the pairing helper.
    :param ref_seq_id: title of the record to skip.
    :param cov: not used by this function.
    :returns: the structure created by ``_get_ref_array(ref_seq)`` with
        the collected lengths appended.
    """
    total_ins = _get_ref_array(ref_seq)

    with open("./input_data/" + input_file) as in_handle:
        for record in SimpleFastaParser(in_handle):
            title, read = record[0], record[1]
            if title == ref_seq_id:
                continue

            pairs_gaps_cut = _split_gaps_from_pairs(ref_seq, read)

            # the reference side of each pair is used here, not the read;
            # this is what differs from the counting of deletions
            ref_side = "".join([pair[0] for pair in pairs_gaps_cut])

            # a sequence made of gaps only is not valid to count
            if _seq_has_gaps_only(ref_side):
                continue

            starts = _get_indel_start_positions(ref_side)
            pos_to_len = _count_indel_lens(ref_side, starts)

            # the read is written out when at least one length is set
            if not all(length == None for length in pos_to_len.values()):
                _write_read_into_fasta(input_file, record, "a+")

            # shift each position left by the total length of the entries
            # that came before it
            shift = 0
            corrected = {}
            for pos, length in pos_to_len.items():
                corrected[pos - shift] = length
                shift += length

            for pos, length in corrected.items():
                total_ins[pos].append(length)

    return total_ins
def make_taxonomy(fasta):
    """Build a taxonomy table from the titles of a FASTA file.

    Each title is first rewritten with the replacements in ``FIXES`` and
    must then consist of exactly two whitespace-separated words: an
    accession and a ";"-separated lineage. Only the first accession seen
    for each lineage (compared case-insensitively) is kept.

    :param fasta: path of the FASTA file.
    :returns: DataFrame indexed by accession ("acc") with the columns
        kingdom, phylum, class, order, family, genus and species.
    """
    seen_lineages = set()
    tax = {}

    with open(fasta) as reader:
        for title, _seq in SimpleFastaParser(reader):
            for kw, repl in FIXES.items():
                title = title.replace(kw, repl)
            acc, lin = title.split()

            lineage_key = lin.lower()
            if lineage_key in seen_lineages:
                continue
            seen_lineages.add(lineage_key)
            tax[acc] = lin.split(";")

    # one column per accession, transposed to one row per accession
    tax_df = pd.DataFrame(tax).T
    tax_df.index.name = "acc"
    tax_df.columns = [
        "kingdom", "phylum", "class", "order", "family", "genus", "species"
    ]

    return tax_df
Exemplo n.º 18
0
def extract(input_file, parameters, output_file, file_type):
    """Extract a region of one read from a FASTA file.

    :param input_file: path of the FASTA file to search.
    :param parameters: string of the form
        ``read_id:start:stop:margin:strand`` where ``start`` and ``stop``
        are 1-based positions, ``margin`` is the number of extra
        characters taken on each side and ``strand`` is "+" or "-"
        ("-" writes the reverse complement).
    :param output_file: path of the FASTA file to write.
    :param file_type: not used by this function.
    :returns: None. When the strand is neither "+" nor "-", a message is
        printed and processing stops at the first matching read.
    """
    read_id, start, stop, margin, strand = parameters.split(':')
    start = int(start)
    stop = int(stop)
    margin = int(margin)

    # Clamp at 0: a negative slice start would silently wrap around and
    # select characters from the end of the sequence.
    begin = max(start - 1 - margin, 0)
    end = stop + margin

    # Context managers close both files on every exit path, including the
    # early return for an invalid strand.
    with open(output_file, "w") as handle, open(input_file) as fasta:
        for title, seq in SimpleFastaParser(fasta):
            if title != read_id:
                continue
            subseq = seq[begin:end]
            if strand == '+':
                handle.write(">%s\n%s\n" % (title, subseq))
            elif strand == '-':
                subseq = str(Seq.Seq(subseq).reverse_complement())
                handle.write(">%s\n%s\n" % (title, subseq))
            else:
                print('Strand is not specified correctly!')
                return

    return
Exemplo n.º 19
0
def delete_N(input_file, parameters, output_file, file_type):
    """Copy the reads that contain no "N" to a new file.

    :param input_file: path of the FASTQ or FASTA file to read.
    :param parameters: not used by this function.
    :param output_file: path of the filtered file to write.
    :param file_type: "fastq" selects FASTQ parsing and output; any other
        value is treated as FASTA.
    :returns: None.
    """
    print('\nReading your file... \n')

    print('Searching reads containing N \n')

    # Both handles are closed by the context manager, also on errors.
    with open(output_file, "w") as handle, open(input_file) as source:
        if file_type == "fastq":
            for title, seq, qual in FastqGeneralIterator(source):
                if 'N' not in seq.upper():
                    handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
        else:
            for title, seq in SimpleFastaParser(source):
                if 'N' not in seq.upper():
                    handle.write(">%s\n%s\n" % (title, seq))

    # The output holds the reads WITHOUT N; the message now says so.
    print('Reads without N are written to {} \n'.format(output_file))

    return
def add_rep_seqs_to_feature_table(feature_table, rep_seq_filepath):
    """
    Adds representative sequences as the ReprSequence column to a QIIME2 feature table

    The FastA titles are matched against the 'Feature ID' column with a
    left merge, so every row of the feature table is kept. The process
    exits with status 1 if the table already has a ReprSequence column.

    :param feature_table: QIIME2 FeatureTable[Frequency] artifact loaded as a pandas DataFrame
    :param rep_seq_filepath: Path to the dna-sequences.fasta file output by the QIIME2 denoising/clustering step
    :return: the feature table with representative sequences in the ReprSequence column
    """
    existing_columns = feature_table.columns.values.tolist()
    if 'ReprSequence' in existing_columns:
        logger.error(
            '"ReprSequence" column already exists in provided feature table. '
            'Cannot add representative sequences. Exiting...')
        sys.exit(1)

    # Read all (title, sequence) pairs, then turn them into a two-column
    # table that can be merged on the feature id
    logger.info('Loading representative sequences FastA file')
    with open(rep_seq_filepath, 'r') as fasta_data:
        records = list(SimpleFastaParser(fasta_data))

    rep_seq_table = pd.DataFrame({
        'Feature ID': [title for title, _ in records],
        'ReprSequence': [sequence for _, sequence in records],
    })

    logger.debug('Adding representative sequences')
    merged = pd.merge(feature_table,
                      rep_seq_table,
                      how='left',
                      on='Feature ID',
                      sort=False,
                      validate='one_to_one')
    return merged
Exemplo n.º 21
0
def filterFasta(assemblies, binid, output, ncontigs, field, reverse):
    """
    Copy the records selected by a bin-membership list into one FASTA file.

    The list returned by ``readBinID(binid, ncontigs, field)`` is indexed
    with the integer found in the second "_"-separated field of each
    defline. A record is written when its entry is 1, or, if ``reverse``
    is true, when its entry is 0. Always returns 0.
    """
    contigList = readBinID(binid, ncontigs, field)
    # value a contig must carry in contigList to be written out
    wanted_flag = 0 if reverse else 1

    with open(output, 'w') as o:
        for assembly in assemblies:
            with open(assembly) as f:
                for defline, sequence in SimpleFastaParser(f):
                    index = int(defline.split('_')[1])
                    if contigList[index] != wanted_flag:
                        continue
                    o.write('>' + defline + '\n')
                    o.write(sequence + '\n')
    return 0
Exemplo n.º 22
0
def fasta_to_pandas(path, separator=";"):
    """
    Load a fasta file into a pandas dataframe.

    Args:
        path: of the fasta file
        separator: used in title of fasta file entry; the text before its
            first occurrence becomes the id, the text after it the title

    Returns: pandas dataframe with 3 columns (id, title, sequence); the
        title is empty when the separator does not occur in the entry title

    """
    identifiers, titles, sequences = [], [], []
    with open(path) as fasta_file:
        for header, sequence in SimpleFastaParser(fasta_file):
            # split once only: everything after the first separator is kept
            parts = header.split(separator, 1)
            identifiers.append(parts[0])
            titles.append(parts[1] if len(parts) > 1 else "")
            sequences.append(sequence)

    columns = {
        "id": identifiers,
        "title": titles,
        "sequence": sequences
    }
    return pd.DataFrame(columns)
Exemplo n.º 23
0
def parse_and_exclude(infile, exclusion_list, fastx_type):
    """
    Return the records of a fasta or fastq file whose titles are not in
    the exclusion_list, formatted as one string in the same format.

    For a fastx_type other than "fasta" or "fastq" the result is empty.
    """
    kept = []

    with open(infile) as infile_handle:
        # Parse based on type
        if fastx_type == "fasta":
            kept.extend(
                ">{}\n{}\n".format(title, seq)
                for title, seq in SimpleFastaParser(infile_handle)
                if title not in exclusion_list)
        elif fastx_type == "fastq":
            kept.extend(
                "@{}\n{}\n+\n{}\n".format(title, seq, qual)
                for title, seq, qual in FastqGeneralIterator(infile_handle)
                if title not in exclusion_list)

    return "".join(kept)
Exemplo n.º 24
0
def fasta_to_saf(path, compression="infer"):
    r"""
    Build a SAF-style table (one row per fasta record) from a fasta file.

    Every record becomes one feature that spans its whole sequence: GeneID
    and Chr are both the first space-separated word of the title, Start is
    1, End is the sequence length and Strand is "+".

    #     GeneID	Chr	Start	End	Strand
    # http://bioinf.wehi.edu.au/featureCounts/

    :param path: path of the fasta file, or "stdin" to read sys.stdin.
    :param compression: forwarded to get_file_object.
    :returns: pandas DataFrame with the columns GeneID, Chr, Start, End
        and Strand.

    # Useful:
    import re
    record_id = "lcl|NC_018632.1_cds_WP_039228897.1_1 [gene=dnaA] [locus_tag=MASE_RS00005] [protein=chromosomal replication initiator protein DnaA] [protein_id=WP_039228897.1] [location=410..2065] [gbkey=CDS]"
    re.search("\[locus_tag=(\w+)\]", record_id).group(1)
    # 'MASE_RS00005'

    """
    # The docstring is a raw string because it contains regex backslashes
    # (\[ and \w) that are invalid escape sequences in a normal string.

    if path == "stdin":
        f = sys.stdin
    else:
        f = get_file_object(path,
                            mode="read",
                            compression=compression,
                            verbose=False)

    saf_data = []
    try:
        for id_record, seq in pv(SimpleFastaParser(f),
                                 "Reading sequences [{}]".format(path)):
            gene_id = id_record.split(" ")[0]
            saf_data.append([gene_id, gene_id, 1, len(seq), "+"])
    finally:
        # Close the file we opened, even if parsing failed; stdin is left
        # open for the caller.
        if f is not sys.stdin:
            f.close()

    return pd.DataFrame(saf_data,
                        columns=["GeneID", "Chr", "Start", "End", "Strand"])
Exemplo n.º 25
0
def output_file_parser(folder, prefix):
    """Collect the set of results from a folder.

    Looks for ``<prefix><ending>`` files inside ``folder`` and loads the
    ones that exist; missing files are skipped silently.

    :param folder: directory holding the result files.
    :param prefix: file name prefix shared by all result files.
    :returns: dict that may contain the keys "contigs", "transcripts" and
        "proteins" (lists of (title, sequence) tuples from the .fna, .ffn
        and .faa files), "features" (the result of ``read_tsv`` on the
        .tsv file), and "genbank" and "gff" (lists of lines from the .gbk
        and .gff files).
    """
    output = {}

    # Collect the FASTA records for contigs, transcripts, and proteins
    for tag, file_ending in [
        ("contigs", ".fna"),
        ("transcripts", ".ffn"),
        ("proteins", ".faa"),
    ]:
        filepath = os.path.join(folder, prefix + file_ending)
        if os.path.exists(filepath):
            logging.info("Reading in {}".format(filepath))
            # The context manager closes the handle once all records are read
            with open(filepath, "rt") as handle:
                output[tag] = list(SimpleFastaParser(handle))

    # Record the features from the TSV
    features_fp = os.path.join(folder, prefix + ".tsv")
    if os.path.exists(features_fp):
        logging.info("Reading in {}".format(features_fp))
        output["features"] = read_tsv(features_fp)

    # Also read in the Genbank and GFF files, as raw lines
    for tag, file_ending in [("genbank", ".gbk"), ("gff", ".gff")]:
        raw_fp = os.path.join(folder, prefix + file_ending)
        if os.path.exists(raw_fp):
            logging.info("Reading in {}".format(raw_fp))
            with open(raw_fp, "rt") as f:
                output[tag] = f.readlines()

    return output
Exemplo n.º 26
0
def insert_reference_genes(args, temp_dir, cxn):
    """Load the reference genes and store them in the database.

    Each record of the FASTA file ``args.reference_genes`` becomes one row
    holding its cleaned name, its sequence and the absolute path
    ``<temp_dir>/<name>.fasta``; all rows are inserted in one batch via
    ``db.insert_reference_genes``.
    """
    ref_genes = args.reference_genes
    log.info('Preparing reference genes: {}'.format(ref_genes))

    batch = []
    with open(ref_genes) as ref_in:
        for title, ref_seq in SimpleFastaParser(ref_in):
            ref_name = util.clean_name(title)
            file_name = '{}.fasta'.format(ref_name)
            row = {
                'ref_name': ref_name,
                'ref_seq': ref_seq,
                'ref_file': abspath(join(temp_dir, file_name)),
            }
            batch.append(row)

    db.insert_reference_genes(cxn, batch)
Exemplo n.º 27
0
def insert_exonerate_results(cxn, iteration, results_file):
    """Store the records of an exonerate results FASTA file in the database.

    Each title must consist of exactly five comma-separated fields:
    ref_name, taxon_name, contig_name, beg and end. Together with the
    iteration and the sequence they form one row; all rows are inserted
    in one batch via ``db.insert_exonerate_results``.
    """
    ExonerateHeader = namedtuple(
        'ExonerateHeader', 'ref_name taxon_name contig_name beg end')

    batch = []
    with open(results_file) as results_fasta:
        for title, seq in SimpleFastaParser(results_fasta):
            # unpacking into the namedtuple enforces the field count
            field = ExonerateHeader(*title.split(','))
            row = dict(field._asdict())
            row['iteration'] = iteration
            row['seq'] = seq
            batch.append(row)

    db.insert_exonerate_results(cxn, batch)
Exemplo n.º 28
0
def createGFF(file, output):
    '''
    Write one tab-separated GFF-style "contig" line per record of a FASTA
    file.

    The sequence name is the full record title and the end coordinate is
    the fourth "_"-separated field of that title; source ("bowtie2"),
    feature ("contig"), start ('0'), score ('40'), strand and frame (".")
    and the attribute (' ') are constants. Always returns 0.

    NOTE(review): the start column is written as '0' while GFF
    coordinates are 1-based - confirm what downstream tools expect.
    '''
    with open(output, 'w') as o, open(file) as f:
        for seqname, _seq in SimpleFastaParser(f):
            end = seqname.split('_')[3]
            # seqname, source, feature, start, end, score, strand, frame,
            # attribute
            columns = (seqname, "bowtie2", "contig", '0', end, '40', ".",
                       ".", ' ')
            o.write('\t'.join(columns) + '\n')
    return 0
    def test_separator(protein_file, separator):
        """Check a header separator and compare the two header parts.

        1) Asserts that the separator divides every fasta header of
           protein_file in exactly 2 parts.
        2) Asserts that the number of distinct first parts differs from
           the number of distinct second parts.

        Returns False when there are more distinct first parts than
        distinct second parts, True otherwise. Presumably this tells the
        caller in which order the contig and protein parts appear in the
        header - confirm against the caller.
        """
        first_parts, second_parts = set(), set()

        with open(protein_file) as in_handle:
            for header, _seq in SimpleFastaParser(in_handle):
                parts = header.split(separator)
                assert (
                    len(parts) == 2
                ), 'header {} in file {} does not work with separator {}'.format(
                    header, protein_file, separator)
                first_parts.add(parts[0])
                second_parts.add(parts[1])

        assert (
            len(first_parts) != len(second_parts)
        ), 'Protein file {} does not look properly formatted (same number of protein IDs and contig IDs)'.format(
            protein_file)
        return not len(first_parts) > len(second_parts)
Exemplo n.º 30
0
def extract_kmers(input_path: str, kmer_size: int) -> List[Tuple[str, str]]:
    """List every k-mer without "N" found in the records of a FASTA file.

    :param input_path: path of the FASTA file.
    :param kmer_size: length of the k-mers to extract.
    :returns: list of ``(label, sequence)`` tuples in file order, where the
        label is ``<record title>|<i+1>:<i+kmer_size+1>`` for the k-mer
        starting at 0-based offset ``i``.
    :raises AssertionError: if ``input_path`` is not an existing file.
    """
    if not isfile(input_path):
        raise AssertionError

    oligos_list: List[Tuple[str, str]] = []
    with open(input_path) as FH:
        records = tqdm(SimpleFastaParser(FH), desc="Parsing region", leave=False)
        for record_header, record_sequence in records:
            sequence = str(record_sequence)
            window_count = len(record_sequence) - kmer_size + 1
            offsets = tqdm(
                range(window_count), desc="Extracting oligos", leave=False
            )
            for start in offsets:
                oligo_sequence = sequence[start : start + kmer_size]
                if "N" not in oligo_sequence:
                    # NOTE(review): the end coordinate in the label is
                    # start + kmer_size + 1 - confirm the intended
                    # coordinate convention.
                    label = f"{record_header}|{start+1}:{start+kmer_size+1}"
                    oligos_list.append((label, oligo_sequence))
    return oligos_list
Exemplo n.º 31
0
def fasta_to_numpy(path, length):
    """
    Encode the sequences of a fasta file as a fixed-width numpy array.

    Sequences longer than ``length`` are truncated. Shorter ones are padded
    with '0' on both sides: half of the missing characters (rounded down)
    on the left and the remainder on the right. Every character, including
    the padding '0', is mapped through ``AMINO_ACID_TO_ID``.

    Args:
        path: of the fasta file
        length: number of characters every encoded sequence has

    Returns: numpy array with one row per sequence and ``length`` columns

    """
    with open(path) as fasta_file:
        sequences = []
        for _title, sequence in SimpleFastaParser(fasta_file):
            sequence = sequence[:length]
            to_pad = length - len(sequence)
            # Left padding: the target width must GROW by to_pad // 2.
            # A width smaller than the string makes rjust a no-op, which
            # left the sequence unpadded on the left.
            sequence = sequence.rjust(len(sequence) + to_pad // 2, '0')
            # Right padding fills whatever is still missing.
            sequence = sequence.ljust(length, '0')
            np_seq = np.asarray([AMINO_ACID_TO_ID[a] for a in sequence])
            sequences.append(np_seq)
        return np.stack(sequences, axis=0)