Exemplo n.º 1
0
def split_seq_file(fa):
    """
    Split the FASTA file `fa` into gzip-compressed chunks under 'partials/'.

    Records are read with `fasta_iter` as (header, sequence) pairs. A new
    chunk is started whenever the current one already holds MAX_SEQ_CHUNKS
    records. Chunks are named
    'partials/<basename of fa>_block_<NNNN>.<faa|fna>.gz'.

    Returns the list of chunk paths in the order they were written (empty
    when the input has no records).

    Raises ValueError when `fa` ends in neither '.faa.gz' nor '.fna.gz'.
    """
    from os import makedirs, path
    import gzip

    # The suffix does not change between records, so validate it once, before
    # any directory is created or any input is read.
    if fa.endswith('.faa.gz'):
        suffix = 'faa.gz'
    elif fa.endswith('.fna.gz'):
        suffix = 'fna.gz'
    else:
        raise ValueError(f'Unexpected file name: {fa}')

    from fasta import fasta_iter

    makedirs('partials', exist_ok=True)
    basename = path.basename(fa)
    partials = []
    out = None
    cur_n = 0
    try:
        for h, seq in fasta_iter(fa):
            if out is None or cur_n >= MAX_SEQ_CHUNKS:
                # Finish the previous chunk before starting the next one so
                # that only one output handle is ever open.
                if out is not None:
                    out.close()
                # len(partials) is the index of the chunk about to be created.
                partials.append(
                    f'partials/{basename}_block_{len(partials):04}.{suffix}')
                out = gzip.open(partials[-1], compresslevel=1, mode='wt')
                cur_n = 0
            out.write(f'>{h}\n{seq}\n')
            cur_n += 1
    finally:
        # `out` is still None when the input contained no records.
        if out is not None:
            out.close()
    return partials
Exemplo n.º 2
0
def dedup_fasta(infile):
    """
    Deduplicate the sequences in `infile` and delete `infile` afterwards.

    Records are read with `fasta_iter` as (ID, sequence) pairs. For every
    distinct sequence the first ID seen and the number of occurrences are
    kept. Two gzip files are written next to the input, both ordered by
    sequence:

    * '<stem>.raw_number.tsv.gz' -- lines of '<count>\\t<sequence>'
    * '<stem>.dedup.faa.gz'      -- FASTA with one record per sequence

    Returns the tuple (tsv_path, fasta_path).

    Raises ValueError when `infile` does not end in '.faa.gz'. Without that
    suffix the output names would equal the input name, so the input would
    be overwritten and then removed.
    """
    import gzip
    import os

    suffix = '.faa.gz'
    if not infile.endswith(suffix):
        raise ValueError(f'Expected a file name ending in {suffix}: {infile}')
    # Only the trailing suffix is swapped; a '.faa.gz' occurring earlier in
    # the path (e.g. in a directory name) is left alone.
    stem = infile[:-len(suffix)]
    outfile1 = stem + '.raw_number.tsv.gz'
    outfile2 = stem + '.dedup.faa.gz'

    from fasta import fasta_iter

    print("start dedup")
    # sequence -> [first ID seen, number of occurrences]
    seen = {}
    for ID, seq in fasta_iter(infile):
        if seq in seen:
            seen[seq][1] += 1
        else:
            seen[seq] = [ID, 1]

    with gzip.open(outfile1, "wt", compresslevel=1) as out1, \
            gzip.open(outfile2, "wt", compresslevel=1) as out2:
        print("start sort")
        for seq, (ID, count) in sorted(seen.items()):
            out1.write(f"{count}\t{seq}\n")
            out2.write(f">{ID}\n{seq}\n")
    # The input is removed only after both outputs were written and closed.
    os.unlink(infile)
    print("finish dedup and sort")
    return (outfile1, outfile2)
def features(ifile):
    """
    Build a table of per-sequence features for every record in `ifile`.

    Records are read with `fasta_iter`; a single trailing '*' is removed from
    each sequence. Three groups of columns are computed and joined side by
    side: `amino_acid_composition(seq)`, the values returned by the R snippet
    below (evaluated through `r` on the full list of sequences), and
    `ctdd(seq, groups)` with the groups taken from GROUPS_SA + GROUPS_HB.

    Returns a pandas DataFrame indexed by record header whose first two
    columns are 'sequence' and 'group' (always 'Unk'), followed by the 22
    named feature columns.
    """
    column_names = [
        "tinyAA",
        "smallAA",
        "aliphaticAA",
        "aromaticAA",
        "nonpolarAA",
        "polarAA",
        "chargedAA",
        "basicAA",
        "acidicAA",
        "charge",
        "pI",
        "aindex",
        "instaindex",
        "boman",
        "hydrophobicity",
        "hmoment",
        "SA.Group1.residue0",
        "SA.Group2.residue0",
        "SA.Group3.residue0",
        "HB.Group1.residue0",
        "HB.Group2.residue0",
        "HB.Group3.residue0",
    ]
    aa_groups = [set(members) for members in (GROUPS_SA + GROUPS_HB)]

    names = []
    peptides = []
    composition_rows = []
    ctdd_rows = []
    for name, peptide in fasta_iter(ifile):
        # Drop one trailing '*' before any feature is computed.
        if peptide[-1] == '*':
            peptide = peptide[:-1]
        peptides.append(peptide)
        names.append(name)
        ctdd_rows.append(ctdd(peptide, aa_groups))
        composition_rows.append(amino_acid_composition(peptide))

    # All sequences are handed to R in one go. Calling R per sequence inside
    # the loop above would avoid holding every sequence in memory, but it is
    # much slower.
    rpy2.robjects.globalenv['seq'] = peptides
    r_columns = r('''
    ch <- charge(seq=seq, pH=7, pKscale="EMBOSS")
    pI <- pI(seq=seq, pKscale="EMBOSS")
    aIndex <- aIndex(seq=seq)
    instaIndex <- instaIndex(seq=seq)
    boman <- boman(seq=seq)
    hydrophobicity <- hydrophobicity(seq=seq, scale="Eisenberg")
    hmoment <- hmoment(seq=seq, angle=100, window=11)
    cbind(ch, pI, aIndex, instaIndex, boman, hydrophobicity, hmoment)
    ''')

    # Column order: composition, R-computed values, then the ctdd encodings.
    matrix = np.hstack([composition_rows, r_columns, ctdd_rows])
    table = pd.DataFrame(matrix, index=names, columns=column_names)
    # Each insert goes to position 0, so 'sequence' ends up before 'group'.
    table.insert(0, 'group', 'Unk')
    table.insert(0, 'sequence', peptides)
    return table
Exemplo n.º 4
0
def mergeseq(outfile):
    """
    Merge every '*.dedup.faa.gz' file in the current directory into `outfile`.

    Each input is read with `fasta_iter` as (header, sequence) pairs and the
    inputs are combined with `heapq.merge` keyed on (sequence, header), so
    every input must already be ordered by that key. Only the first record
    of each run of equal sequences is written, to a gzip-compressed FASTA.

    The list of inputs is collected before `outfile` is created, and
    `outfile` itself is never used as an input even when its name matches
    the pattern.
    """
    import heapq
    import gzip
    from glob import glob
    from os import path
    from fasta import fasta_iter

    target = path.abspath(outfile)
    sources = [f for f in glob('*.dedup.faa.gz') if path.abspath(f) != target]

    with gzip.open(outfile, compresslevel=1, mode='wt') as output:
        inputs = [fasta_iter(f) for f in sources]
        merged = heapq.merge(*inputs, key=lambda h_seq: (h_seq[1], h_seq[0]))
        # None can never equal a real sequence string, so the first record is
        # always written.
        preseq = None
        for h, seq in merged:
            if seq != preseq:
                output.write(f'>{h}\n{seq}\n')
                preseq = seq
Exemplo n.º 5
0
def read_fasta_sequences(filename):
    """
    Parse the given FASTA file into a screed database.

    The file is opened in binary mode, wrapped in `fasta.fasta_iter`, and
    passed to `create_db` together with `fasta.FieldTypes`. The file is
    closed again even if building the database fails.

    Returns `ScreedDB(filename)`.

    Raises whatever `open` raises when the file cannot be opened (for
    example because it does not exist).
    """
    with open(filename, "rb") as theFile:
        # Set up the record iterator over the open file
        iterfunc = fasta.fasta_iter(theFile)

        # Create the screed db while the file is still open
        create_db(filename, fasta.FieldTypes, iterfunc)

    return ScreedDB(filename)
Exemplo n.º 6
0
def splitseq(infile):
    """
    Distribute the records of `infile` over 256 gzip files by sequence hash.

    Records are read with `fasta_iter` as (ID, sequence) pairs. The bucket of
    a record is the first byte of the SHA-256 digest of its ASCII-encoded
    sequence, so identical sequences always land in the same file. The files
    are 'split/submetag_000.faa.gz' ... 'split/submetag_255.faa.gz'; the
    'split' directory is created when missing.

    Returns the list of the 256 output paths (all files are created, even
    those that receive no record).
    """
    import os

    print("start splitseq")
    os.makedirs('split', exist_ok=True)
    outputlist = [f'split/submetag_{ix:03}.faa.gz' for ix in range(256)]
    outputfiles = []
    try:
        # Opened one by one so that a failure part-way through still closes
        # the handles that were already opened.
        for oname in outputlist:
            outputfiles.append(gzip.open(oname, compresslevel=1, mode='wt'))
        for ID, seq in fasta_iter(infile):
            # First digest byte == int(hexdigest()[:2], 16), a value in 0..255.
            ix = hashlib.sha256(seq.encode('ascii')).digest()[0]
            outputfiles[ix].write(f'>{ID}\n{seq}\n')
    finally:
        for ot in outputfiles:
            ot.close()
    print("finish splitseq")
    return (outputlist)
Exemplo n.º 7
0
def calseq(infile, outfile1, outfile2):
    """
    Count how often each distinct sequence occurs in `infile`.

    Records are read with `fasta_iter` as (ID, sequence) pairs. For every
    distinct sequence the first ID seen and the number of occurrences are
    kept, then written in first-seen order to two gzip files:

    * `outfile1` -- lines of '<count>\\t<sequence>'
    * `outfile2` -- FASTA with one record per distinct sequence

    Both outputs are closed even if writing fails.
    """
    from fasta import fasta_iter
    import gzip

    # sequence -> [first ID seen, number of occurrences]
    seen = {}
    for ID, seq in fasta_iter(infile):
        if seq in seen:
            seen[seq][1] += 1
        else:
            seen[seq] = [ID, 1]

    with gzip.open(outfile1, "wt", compresslevel=1) as out1, \
            gzip.open(outfile2, "wt", compresslevel=1) as out2:
        for seq, (ID, count) in seen.items():
            out1.write(f"{count}\t{seq}\n")
            out2.write(f">{ID}\n{seq}\n")
Exemplo n.º 8
0
def extract_seq(infile1,infile2,outfile1,outfile2):
    """
    Split the records of `infile2` by whether their sequence is listed in
    `infile1` with a first field other than "1".

    `infile1` is a gzip text file of tab-separated lines; the second field of
    every line whose first field is not the string "1" is collected.
    Records of `infile2` (read with `fasta_iter`) whose sequence is in that
    collection go to `outfile1`, all others to `outfile2`. Both outputs are
    gzip-compressed FASTA.
    """
    from fasta import fasta_iter
    import gzip

    with gzip.open(infile1, "rt") as table:
        rows = (raw.strip().split("\t") for raw in table)
        selected = {fields[1] for fields in rows if fields[0] != "1"}

    with gzip.open(outfile1, "wt", compresslevel=1) as listed, \
            gzip.open(outfile2, "wt", compresslevel=1) as unlisted:
        for ID, seq in fasta_iter(infile2):
            destination = listed if seq in selected else unlisted
            destination.write(f'>{ID}\n{seq}\n')
Exemplo n.º 9
0
def splitseq(infile, X, outfile):
    '''
    Split `infile` into gzip FASTA files of at most `X` records each.

    `outfile` is a format string with an `ix` field; chunk number ix is
    written to `outfile.format(ix=ix)`, starting at 0. The first chunk file
    is created even when the input has no records. Records are read with
    `fasta_iter` as (ID, sequence) pairs, and reading stops after the first
    record whose ID is empty.

    Each chunk is closed before the next one is opened, and the current
    chunk is closed even if reading or writing fails.
    '''
    import gzip
    from fasta import fasta_iter

    ix = 0
    n = 0
    out = gzip.open(outfile.format(ix=ix), "wt", compresslevel=1)
    try:
        for ID, seq in fasta_iter(infile):
            if n >= X:
                # Current chunk is full: finish it and move on to the next.
                out.close()
                ix += 1
                n = 0
                out = gzip.open(outfile.format(ix=ix), "wt", compresslevel=1)
            out.write(f'>{ID}\n{seq}\n')
            n += 1
            # NOTE(review): kept from the original -- an empty ID ends the
            # split after that record is written; confirm this is intended.
            if not ID:
                break
    finally:
        out.close()