Exemplo n.º 1
0
def BINARY(fastas, **kw):
    """One-hot encode every residue of a set of equal-length sequences.

    Each sequence position contributes 21 columns, one per letter of
    ``ARNDCQEGHILKMFPSTWYVX``. A gap (``-``), like any other character that
    is not in that alphabet, contributes 21 zeros.

    Args:
        fastas: sequence of records; item 0 of a record is its name and
            item 1 is the sequence string.
        **kw: accepted for a uniform encoder signature; not used here.

    Returns:
        A list of rows. The first row is the header (``'#'`` followed by
        ``BINARY.F1`` ... ``BINARY.F<21 * length>``); every following row is
        ``[name, bit, bit, ...]``. Returns ``0`` after printing an error when
        ``checkFasta.checkFasta(fastas)`` reports failure.
    """
    # Explicit comparison kept on purpose: only a result equal to False is
    # treated as a failed check.
    if checkFasta.checkFasta(fastas) == False:
        print(
            'Error: for "BINARY" encoding, the input fasta sequences should be with equal length. \n\n'
        )
        return 0

    AA = 'ARNDCQEGHILKMFPSTWYVX'
    width = len(AA)

    header = ['#']
    header.extend('BINARY.F' + str(i)
                  for i in range(1, len(fastas[0][1]) * width + 1))
    encodings = [header]

    for record in fastas:
        name, sequence = record[0], record[1]
        code = [name]
        for aa in sequence:
            # A gap matches no alphabet letter, so it naturally yields an
            # all-zero block. The previous code used `break` here, which
            # dropped every residue after the first gap and produced a row
            # shorter than the header.
            code.extend(1 if aa == aa1 else 0 for aa1 in AA)
        encodings.append(code)
    return encodings
Exemplo n.º 2
0
def PSSM(fastas, **kw):
    """Encode each peptide with the matching rows of its protein's PSSM file.

    For every record a file ``<path>/<name>.pssm`` is read. Field 1 of each
    data line is taken as the residue letter and fields 2..21 as the 20
    profile values. The peptide is located inside the residue string built
    from those lines and the profile rows covering it are concatenated.

    Args:
        fastas: sequence of records; item 0 is the name, item 1 the sequence.
        **kw: must provide ``path``, the directory holding the ``.pssm``
            files.

    Returns:
        A list of rows: the header (``'#'`` then ``Pos.<p>.<aa>`` labels)
        followed by one row per record whose file exists. A record whose
        peptide cannot be found in the file yields a row holding only its
        name. Returns ``0`` after printing an error when the equal-length
        check fails or ``path`` is missing/None.
    """
    # Explicit comparison kept on purpose: only a result equal to False is
    # treated as a failed check.
    if checkFasta.checkFasta(fastas) == False:
        print(
            'Error: for "PSSM" encoding, the input fasta sequences should be with equal length. \n\n'
        )
        return 0

    # .get() so that a call without `path` reports the error below instead
    # of raising KeyError.
    pssmDir = kw.get('path')
    if pssmDir is None:
        print(
            'Error: please specify the directory of PSSM profile files by "--path" \n\n'
        )
        return 0

    AA = 'ARNDCQEGHILKMFPSTWYV'

    header = ['#']
    for p in range(1, len(fastas[0][1]) + 1):
        for aa in AA:
            header.append('Pos.' + str(p) + '.' + aa)
    encodings = [header]

    for record in fastas:
        name, sequence = record[0], record[1]
        code = [name]
        pssmFile = os.path.join(pssmDir, name + '.pssm')
        if not os.path.exists(pssmFile):
            # Best effort: report the missing profile and skip this record.
            print('Error: pssm profile for protein ' + name +
                  ' does not exist.')
            continue

        with open(pssmFile) as f:
            # NOTE(review): only file lines 4..29 are read (26 rows at most).
            # This hard-coded window is kept from the original; confirm it
            # matches the layout of the profiles being used.
            records = f.readlines()[3:29]

        proteinSeq = ''
        pssmMatrix = []
        for line in records:
            array = line.split()
            # Skip blank/whitespace-only lines (and anything too short to
            # carry a residue letter) instead of crashing on array[1].
            if len(array) < 2:
                continue
            pssmMatrix.append(array[2:22])
            proteinSeq = proteinSeq + array[1]

        pos = proteinSeq.find(sequence)
        if pos == -1:
            print('Warning: could not find the peptide in proteins.\n\n')
        else:
            for p in range(pos, pos + len(sequence)):
                code.extend(pssmMatrix[p])
        encodings.append(code)

    return encodings
Exemplo n.º 3
0
def DisorderB(fastas, **kw):
    """Encode each residue of a peptide as a two-value disorder flag.

    For every record a file ``<path>/<name>.dis`` is read. Lines up to and
    including the first one starting with ``-------`` are dropped, as is the
    last line of the file. In the remaining lines field 1 is taken as the
    residue letter and field 3 as the state: ``'D'`` maps to ``[0, 1]``,
    anything else to ``[1, 0]``.

    Args:
        fastas: sequence of records; item 0 is the name, item 1 the sequence.
        **kw: must provide ``path``, the directory holding the ``.dis``
            files.

    Returns:
        A list of rows: the header (``'#'`` then ``disorderB.F1`` ...
        ``disorderB.F<2 * length>``) followed by one row per record. A record
        whose peptide cannot be found yields a row holding only its name.
        Returns ``0`` after printing an error when the equal-length check
        fails, ``path`` is missing/None, or a ``.dis`` file does not exist.
    """
    # Explicit comparison kept on purpose: only a result equal to False is
    # treated as a failed check.
    if checkFasta.checkFasta(fastas) == False:
        print(
            'Error: for "DisorderB" encoding, the input fasta sequences should be with equal length. \n\n'
        )
        return 0

    # .get() so that a call without `path` reports the error below instead
    # of raising KeyError.
    disDir = kw.get('path')
    if disDir is None:
        print(
            'Error: please specify the directory of predicted protein disorder files by "--path"'
        )
        return 0

    header = ['#']
    header.extend('disorderB.F' + str(p)
                  for p in range(1, 2 * len(fastas[0][1]) + 1))
    encodings = [header]

    myDict = {'D': [0, 1], 'O': [1, 0]}

    for record in fastas:
        name, sequence = record[0], record[1]
        code = [name]
        disFile = os.path.join(disDir, name + '.dis')
        if not os.path.exists(disFile):
            print(
                'Error: the predicted disorder information file (.dis) for protein '
                + name + ' does not exist.')
            return 0
        with open(disFile) as f:
            records = f.readlines()

        # Index of the first separator line; stays 0 when none is found,
        # in which case only the first line is dropped.
        tag = 0
        for idx, line in enumerate(records):
            if line.startswith('-------'):
                tag = idx
                break
        records = records[tag + 1:-1]

        proteinSeq = ''
        disValue = []
        for line in records:
            array = line.split()
            # Blank lines used to produce None and crash on array[3].
            if not array:
                continue
            key = array[3] if array[3] == 'D' else 'O'
            proteinSeq = proteinSeq + array[1]
            disValue.append(key)

        pos = proteinSeq.find(sequence)
        if pos == -1:
            print('Warning: could not find the peptide in proteins.\n\n')
        else:
            for p in range(pos, pos + len(sequence)):
                code.extend(myDict[disValue[p]])
        encodings.append(code)

    return encodings
Exemplo n.º 4
0
def AAINDEX(fastas, **kw):
    """Encode each residue with the values of every property in an index file.

    The index file is read from a ``data`` directory located relative to
    this module (a trailing ``codes`` in the module directory is stripped
    first). Its first line is skipped; on every other non-blank line the
    first field is the property name and the remaining fields are values,
    addressed by the residue's position in ``ARNDCQEGHILKMFPSTWXYV``.

    Args:
        fastas: sequence of records; item 0 is the name, item 1 the sequence.
        **kw: accepted for a uniform encoder signature; not used here.

    Returns:
        A list of rows: the header (``'#'`` then ``SeqPos.<pos>.<property>``
        labels) followed by one row per record. A gap (``-``) contributes an
        integer 0 per property; other residues contribute the value strings
        read from the file. Returns ``0`` after printing an error when the
        equal-length check fails.

    Raises:
        KeyError: if a sequence contains a character that is neither ``-``
            nor in the alphabet above.
    """
    # Explicit comparison kept on purpose: only a result equal to False is
    # treated as a failed check.
    if checkFasta.checkFasta(fastas) == False:
        print(
            'Error: for "AAINDEX" encoding, the input fasta sequences should be with equal length. \n\n'
        )
        return 0

    AA = 'ARNDCQEGHILKMFPSTWXYV'

    baseDir = re.sub('codes$', '',
                     os.path.split(os.path.realpath(__file__))[0])
    # NOTE(review): the two platforms load differently named files. The
    # 21-letter alphabet above (with 'X') looks like it belongs to
    # AAINDEXwithX.txt; confirm that AAindex.txt has 21 value columns in the
    # same order, otherwise non-Windows runs read the wrong column for 'Y'
    # and fail on 'V'.
    if platform.system() == 'Windows':
        fileAAindex = baseDir + r'\data\AAINDEXwithX.txt'
    else:
        fileAAindex = baseDir + '/data/AAindex.txt'

    with open(fileAAindex) as f:
        records = f.readlines()[1:]

    AAindex = []
    AAindexName = []
    for line in records:
        fields = line.split()
        # Blank lines used to be stored as None and crashed later when the
        # header and the rows were built; skip them instead.
        if not fields:
            continue
        AAindexName.append(fields[0])
        AAindex.append(fields[1:])

    index = {aa: i for i, aa in enumerate(AA)}

    header = ['#']
    for pos in range(1, len(fastas[0][1]) + 1):
        for idName in AAindexName:
            header.append('SeqPos.' + str(pos) + '.' + idName)
    encodings = [header]

    for record in fastas:
        name, sequence = record[0], record[1]
        code = [name]
        for aa in sequence:
            if aa == '-':
                code.extend([0] * len(AAindex))
                continue
            column = index[aa]
            code.extend(values[column] for values in AAindex)
        encodings.append(code)

    return encodings
Exemplo n.º 5
0
def TA(fastas, **kw):
    """Encode each residue of a peptide with two values from a .spXout file.

    For every record a file ``<path>/<name>.spXout`` is read and its first
    line skipped. On each remaining non-blank line field 1 is taken as the
    residue letter and fields 3 and 4 as the two values, labelled ``phi``
    and ``psi`` in the header.

    Args:
        fastas: sequence of records; item 0 is the name, item 1 the sequence.
        **kw: must provide ``path``, the directory holding the ``.spXout``
            files.

    Returns:
        A list of rows: the header (``'#'`` then ``TA.F<p>.phi`` /
        ``TA.F<p>.psi`` labels) followed by one row per record, with the
        values kept as the strings read from the file. A record whose peptide
        cannot be found yields a row holding only its name. Returns ``0``
        after printing an error when the equal-length check fails, ``path``
        is missing/None, or a ``.spXout`` file does not exist.
    """
    # Explicit comparison kept on purpose: only a result equal to False is
    # treated as a failed check.
    if checkFasta.checkFasta(fastas) == False:
        print(
            'Error: for "TA" encoding, the input fasta sequences should be with equal length. \n\n'
        )
        return 0

    header = ['#']
    for p in range(1, len(fastas[0][1]) + 1):
        header.append('TA.F' + str(p) + '.phi')
        header.append('TA.F' + str(p) + '.psi')
    encodings = [header]

    # .get() so that a call without `path` reports the error below instead
    # of raising KeyError.
    disDir = kw.get('path')
    if disDir is None:
        print(
            'Error: please specify the directory of predicted protein TA file by "--path"'
        )
        return 0

    for record in fastas:
        name, sequence = record[0], record[1]
        code = [name]
        # Check the very file that is opened below. The previous code tested
        # for "<name>.dis", so a present .spXout could be rejected and a
        # missing one slipped through to open().
        taFile = os.path.join(disDir, name + '.spXout')
        if not os.path.exists(taFile):
            print(
                'Error: the predicted TA information file (.spXout) for protein '
                + name + ' does not exist.')
            return 0

        with open(taFile) as f:
            records = f.readlines()[1:]

        proteinSeq = ''
        asaValue = []
        for line in records:
            array = line.split()
            # Blank lines used to produce None and crash on array[1].
            if not array:
                continue
            proteinSeq = proteinSeq + array[1]
            asaValue.append(array[3:5])

        pos = proteinSeq.find(sequence)
        if pos == -1:
            print('Warning: could not find the peptide in proteins.\n\n')
        else:
            for p in range(pos, pos + len(sequence)):
                code.append(asaValue[p][0])
                code.append(asaValue[p][1])
        encodings.append(code)

    return encodings
Exemplo n.º 6
0
def ZSCALE(fastas, **kw):
    """Encode every residue of equal-length sequences with five z-scale values.

    Args:
        fastas: sequence of records; item 0 is the name, item 1 the sequence.
        **kw: accepted for a uniform encoder signature; not used here.

    Returns:
        A list of rows: the header (``'#'`` then ``Pos<p>.ZSCALE<1..5>``
        labels) followed by one ``[name, v1, v2, ...]`` row per record.
        A gap (``-``) contributes five zeros. Returns ``0`` after printing an
        error when ``checkFasta.checkFasta(fastas)`` reports failure.

    Raises:
        KeyError: if a sequence contains a character missing from the table.
    """
    if checkFasta.checkFasta(fastas) == False:
        print(
            'Error: for "ZSCALE" encoding, the input fasta sequences should be with equal length. \n\n'
        )
        return 0

    # Five descriptor values per residue, keyed by one-letter code.
    zscale = {
        'A': [0.24, -2.32, 0.60, -0.14, 1.30],
        'C': [0.84, -1.67, 3.71, 0.18, -2.65],
        'D': [3.98, 0.93, 1.93, -2.46, 0.75],
        'E': [3.11, 0.26, -0.11, -0.34, -0.25],
        'F': [-4.22, 1.94, 1.06, 0.54, -0.62],
        'G': [2.05, -4.06, 0.36, -0.82, -0.38],
        'H': [2.47, 1.95, 0.26, 3.90, 0.09],
        'I': [-3.89, -1.73, -1.71, -0.84, 0.26],
        'K': [2.29, 0.89, -2.49, 1.49, 0.31],
        'L': [-4.28, -1.30, -1.49, -0.72, 0.84],
        'M': [-2.85, -0.22, 0.47, 1.94, -0.98],
        'N': [3.05, 1.62, 1.04, -1.15, 1.61],
        'P': [-1.66, 0.27, 1.84, 0.70, 2.00],
        'Q': [1.75, 0.50, -1.44, -1.34, 0.66],
        'R': [3.52, 2.50, -3.50, 1.99, -0.17],
        'S': [2.39, -1.07, 1.15, -1.39, 0.67],
        'T': [0.75, -2.18, -1.12, -1.46, -0.40],
        'V': [-2.59, -2.64, -1.54, -0.85, -0.02],
        'W': [-4.36, 3.94, 0.59, 3.44, -1.59],
        'Y': [-2.54, 2.44, 0.43, 0.04, -1.47],
        '-': [0.00] * 5,
    }

    positions = range(1, len(fastas[0][1]) + 1)
    header = ['#'] + [
        'Pos' + str(p) + '.ZSCALE' + z for p in positions for z in '12345'
    ]
    encodings = [header]

    for record in fastas:
        name, sequence = record[0], record[1]
        row = [name]
        for residue in sequence:
            row.extend(zscale[residue])
        encodings.append(row)
    return encodings
Exemplo n.º 7
0
def EGAAC(fastas, window=5, **kw):
    """Compute grouped amino-acid composition over a sliding window.

    For every window of ``window`` consecutive characters, the residues
    belonging to each of five fixed groups are counted and each count is
    divided by ``window``.

    Args:
        fastas: sequence of records; item 0 is the name, item 1 the sequence.
        window: sliding-window length; must be at least 1.
        **kw: accepted for a uniform encoder signature; not used here.

    Returns:
        A list of rows: the header (``'#'`` then ``SW.<w>.<group>`` labels)
        followed by one ``[name, fraction, ...]`` row per record. Returns
        ``0`` after printing an error when the equal-length check fails or
        ``window`` is smaller than 1.
    """
    if checkFasta.checkFasta(fastas) == False:
        print(
            'Error: for "EGAAC" encoding, the input fasta sequences should be with equal length. \n\n'
        )
        return 0

    if window < 1:
        print('Error: the sliding window should be greater than zero' + '\n\n')
        return 0

    # Group label -> member residues; the labels appear in the header.
    group = {
        'alphaticr': 'GAVLMI',
        'aromatic': 'FYW',
        'postivecharger': 'KRH',
        'negativecharger': 'DE',
        'uncharger': 'STCPNQ'
    }

    header = ['#'] + [
        'SW.' + str(w) + '.' + label
        for w in range(1, len(fastas[0][1]) - window + 2)
        for label in group
    ]
    encodings = [header]

    for record in fastas:
        name, sequence = record[0], record[1]
        row = [name]
        # One window per start offset that still fits inside the sequence.
        for start in range(len(sequence) - window + 1):
            tally = Counter(sequence[start:start + window])
            for members in group.values():
                hits = sum(tally[residue] for residue in members)
                row.append(hits / window)
        encodings.append(row)
    return encodings
Exemplo n.º 8
0
def BLOSUM62(fastas, **kw):
    """Encode every residue of equal-length sequences with a 20-value row.

    Args:
        fastas: sequence of records; item 0 is the name, item 1 the sequence.
        **kw: accepted for a uniform encoder signature; not used here.

    Returns:
        A list of rows: the header (``'#'`` then ``blosum62.F1`` ...
        ``blosum62.F<20 * length>``) followed by one ``[name, v, v, ...]``
        row per record. A gap (``-``) contributes 20 zeros. Returns ``0``
        after printing an error when ``checkFasta.checkFasta(fastas)``
        reports failure.

    Raises:
        KeyError: if a sequence contains a character missing from the table.
    """
    if checkFasta.checkFasta(fastas) == False:
        print('Error: for "BLOSUM62" encoding, the input fasta sequences should be with equal length. \n\n')
        return 0

    # Substitution scores per residue, 20 values each, keyed by one-letter code.
    blosum62 = {
        'A': [4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0],
        'R': [-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3],
        'N': [-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3],
        'D': [-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3],
        'C': [0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1],
        'Q': [-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2],
        'E': [-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2],
        'G': [0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3],
        'H': [-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3],
        'I': [-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3],
        'L': [-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1],
        'K': [-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2],
        'M': [-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1],
        'F': [-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1],
        'P': [-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2],
        'S': [1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2],
        'T': [0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0],
        'W': [-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3],
        'Y': [-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1],
        'V': [0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4],
        '-': [0] * 20,
    }

    column_count = len(fastas[0][1]) * 20
    header = ['#'] + ['blosum62.F' + str(n) for n in range(1, column_count + 1)]
    encodings = [header]

    for record in fastas:
        name, sequence = record[0], record[1]
        row = [name]
        for residue in sequence:
            row.extend(blosum62[residue])
        encodings.append(row)
    return encodings
Exemplo n.º 9
0
def EAAC(fastas, window=5, **kw):
    """Compute amino-acid composition over a sliding window.

    For every window of ``window`` consecutive characters, gaps (``-``) are
    removed and the frequency of each residue among the remaining characters
    is reported in the order given by the alphabet.

    Args:
        fastas: sequence of records; item 0 is the name, item 1 the sequence.
        window: sliding-window length; must be at least 1 and no longer than
            the shortest sequence.
        **kw: may provide ``order``, the residue alphabet to report; when it
            is absent or None, ``ACDEFGHIKLMNPQRSTVWY`` is used.

    Returns:
        A list of rows: the header (``'#'`` then ``SW.<w>.<aa>`` labels)
        followed by one ``[name, value, ...]`` row per record. A residue
        absent from a window contributes integer 0. Returns ``0`` after
        printing an error when the equal-length check fails, ``window`` is
        smaller than 1, or a sequence is shorter than ``window``.
    """
    # Explicit comparison kept on purpose: only a result equal to False is
    # treated as a failed check.
    if checkFasta.checkFasta(fastas) == False:
        print(
            'Error: for "EAAC" encoding, the input fasta sequences should be with equal length. \n\n'
        )
        return 0

    if window < 1:
        print('Error: the sliding window should be greater than zero' + '\n\n')
        return 0

    if checkFasta.minSequenceLength(fastas) < window:
        print(
            'Error: all the sequence length should be larger than the sliding window :'
            + str(window) + '\n\n')
        return 0

    # .get() so that a call without `order` falls back to the default
    # alphabet instead of raising KeyError.
    order = kw.get('order')
    AA = order if order is not None else 'ACDEFGHIKLMNPQRSTVWY'

    header = ['#']
    for w in range(1, len(fastas[0][1]) - window + 2):
        for aa in AA:
            header.append('SW.' + str(w) + '.' + aa)
    encodings = [header]

    for record in fastas:
        name, sequence = record[0], record[1]
        code = [name]
        for start in range(len(sequence) - window + 1):
            # Gaps are excluded from both the counts and the denominator.
            fragment = sequence[start:start + window].replace('-', '')
            count = Counter(fragment)
            total = len(fragment)
            for aa in AA:
                # `aa in count` implies total > 0, so an all-gap window can
                # never divide by zero; absent residues stay integer 0.
                code.append(count[aa] / total if aa in count else 0)
        encodings.append(code)
    return encodings