示例#1
0
文件: util.py 项目: jacekrad/scie2100
def searchAndSave(searchString, filename):
    """Search the database and save the matching records to a FASTA file.

    If a file called ``filename`` already exists the whole process is
    skipped, so the file acts as a cache for a previous search.

    searchString - the query handed to ``seq.searchSequences``
    filename - name of the file where the results are saved

    The FASTA file is written once, after every record has been fetched.
    Writing inside the fetch loop (as was done previously) rewrote the file
    for each record and, if a fetch failed half-way, left a partial file
    behind that later calls would mistake for a complete result.
    """
    if path.isfile(filename):
        # Single-string print calls behave the same on Python 2 and 3; the
        # doubled spaces reproduce the spacing of the former print statements.
        print("%s  exists. skipping." % (filename,))
        return

    # identifiers returned by the search
    ids = seq.searchSequences(searchString)
    print("Processing  %d  sequences ..." % len(ids))

    # fetch the record behind every identifier
    sequences = []
    for seq_id in ids:
        print("Fetching sequence:  %s" % (seq_id,))
        sequences.append(seq.getSequence(seq_id))

    # As before, nothing is written when the search returned no identifiers,
    # so an empty result is not cached as a file.
    if sequences:
        seq.writeFastaFile(filename, sequences)
示例#2
0
def allMotifs_fa(args):
    """Write a reduced FASTA file for every CSV file listed in ``args.input``.

    The first row of each CSV is read as the header. A data row is kept only
    when every column strictly between the first and the last header column
    is non-empty. For kept rows, column 0 becomes the sequence name and the
    last header column becomes the sequence. Rows sharing a name overwrite
    each other (last one wins).

    The output file is named after the text before the first ``.`` of the
    input path, with ``_reduced.fa`` appended. Blank rows and rows with fewer
    fields than the header are treated as incomplete and skipped.
    """
    for csv_path in args.input:
        out_name = csv_path.split('.')[0] + '_reduced.fa'
        fasta = {}

        with open(csv_path, newline='') as f:
            reader = csv.reader(f)
            header = next(reader)
            seq_col = len(header) - 1

            for row in reader:
                # A short row cannot satisfy "all columns have a value".
                if not row or len(row) < len(header):
                    continue
                # Columns 1 .. seq_col-1 are the ones that must be filled in.
                if any(row[col] == "" for col in range(1, seq_col)):
                    continue
                fasta[row[0]] = row[seq_col]

        seq_list = [
            sequence.Sequence(sequence=seq, name=seqname)
            for seqname, seq in fasta.items()
        ]
        sequence.writeFastaFile(out_name, seq_list)
        print(str(len(seq_list)) + " sequences kept after applying the requirements for " + out_name)
示例#3
0
def _table_to_fasta(table_path, fasta_path, dialect):
    """Read name/sequence pairs from a delimited file and write them as FASTA.

    Column 0 is the sequence name and column 1 the sequence; a repeated name
    overwrites the earlier entry. Blank rows are skipped.
    """
    pairs = {}
    # newline='' lets the csv module do its own newline handling.
    with open(table_path, newline='') as handle:
        for row in csv.reader(handle, dialect=dialect):
            if not row:
                continue
            pairs[row[0]] = row[1]

    seq_list = [
        sequence.Sequence(sequence=seq, name=seqname)
        for seqname, seq in pairs.items()
    ]
    sequence.writeFastaFile(fasta_path, seq_list)


def read(args):
    """Convert ``args.input`` between tabular and FASTA representations.

    The kind of input is decided by substring tests on ``args.input``,
    checked in this order:

    * contains ``.csv``            -> written as ``<output>.fa``
    * contains ``.tab`` or ``.tsv`` -> written as ``<output>.fa``
    * contains ``.fa``             -> written as ``<output>.csv`` with the
      columns ``Name`` and ``Sequence``

    ``<output>`` is whatever ``output(args)`` returns. Nothing is written
    when none of the tests match.
    """
    outputfile = output(args)

    if '.csv' in args.input:
        print("this is a CSV file")
        _table_to_fasta(args.input, outputfile + '.fa', dialect="excel")

    elif '.tab' in args.input or '.tsv' in args.input:
        print("this is a TAB/TSV file")
        _table_to_fasta(args.input, outputfile + '.fa', dialect="excel-tab")

    # Any name containing '.fasta' also contains '.fa', so one test suffices.
    elif '.fa' in args.input:
        print("this is a FASTA file")
        outputfile = outputfile + '.csv'
        db100 = sequence.readFastaFile(args.input,
                                       sequence.Protein_Alphabet,
                                       ignore=True,
                                       parse_defline=False)

        with open(outputfile, 'w', newline='') as f:
            thewriter = csv.DictWriter(f, fieldnames=['Name', 'Sequence'])
            thewriter.writeheader()
            for entry in db100:
                thewriter.writerow({
                    'Name': entry.name,
                    'Sequence': ''.join(entry.sequence),
                })
示例#4
0
def posEqual_fa(args):
    """Turn ``<stem>_reduced.csv`` into ``<stem>.fa``.

    ``<stem>`` is the text before the first ``.`` of ``args.input[0]``. The
    first CSV row is the header; for every following row, column 0 is used as
    the sequence name and the last header column as the sequence. A repeated
    name overwrites the earlier entry.
    """
    stem = args.input[0].split('.')[0]

    with open(stem + '_reduced.csv', newline='') as handle:
        rows = csv.reader(handle)
        last_col = len(next(rows)) - 1

        by_name = {row[0]: row[last_col] for row in rows}

        records = [
            sequence.Sequence(sequence=residues, name=label)
            for label, residues in by_name.items()
        ]
        sequence.writeFastaFile(stem + '.fa', records)
示例#5
0
def _match_representatives(rep_names, db_map_short, seqs_by_name):
    """Copy the sequence of every representative found in the database.

    For each name in ``rep_names`` that is a key of ``db_map_short`` the
    entry's joined sequence is stored in ``seqs_by_name`` under the entry's
    own ``name``. Prints how many names were matched and returns the list of
    names that were not found.
    """
    notfound = []
    for name in rep_names:
        if name in db_map_short:
            s = db_map_short[name]
            seqs_by_name[s.name] = "".join(s)
        else:
            notfound.append(name)
    print('Matched', len(rep_names) - len(notfound), 'of', len(rep_names))
    return notfound


def main():
    """BLAST a query FASTA against an input FASTA and export the best hits.

    Steps, as performed below:

    1. build a protein BLAST database from ``--input`` (``makeblastdb``);
    2. run ``blastp`` for ``--query`` and store the report in ``query.txt``;
    3. parse ``query.txt`` (UniProt-style or RefSeq-style identifiers,
       selected with ``--reference``) into a query -> representative map;
    4. write the matched representative sequences to ``<output>.fa`` and the
       query/representative/sequence table to ``<output>.csv``.

    Side effects: creates ``query.txt`` (and ``UniProt_query.tab`` for the
    ``uniprotkb`` reference) in the current directory.
    """
    import shlex  # local import: only needed to quote user-supplied paths

    parser = argparse.ArgumentParser()
    parser.add_argument("-i",
                        "--input",
                        help="FASTA file to query from",
                        required=True)
    parser.add_argument("-q",
                        "--query",
                        help="Query FASTA file",
                        required=True)
    parser.add_argument("-db",
                        "--database",
                        help="Database output file name",
                        required=True)
    parser.add_argument("-r",
                        "--reference",
                        help="Reference database ",
                        default="uniprotkb")
    parser.add_argument("-o",
                        "--output",
                        help="Output path",
                        default="matchmyseqs")

    args = parser.parse_args()

    seqDict = {}  # query name -> representative hit name
    seqsforCSV = {}  # representative entry name -> sequence string

    # User-supplied paths are quoted so that spaces or shell metacharacters
    # in them cannot break (or inject into) the command line.
    os.system('makeblastdb -dbtype prot -in ' + shlex.quote(args.input) +
              ' -out ' + shlex.quote(args.database))

    db = sequence.readFastaFile(args.input,
                                sequence.Protein_Alphabet,
                                ignore=True,
                                parse_defline=False)
    db_map = {}  # map from "long" name to actual entry
    db_map_short = {}  # map from "short" name to entry
    for s in db:
        db_map[s.name] = s
        db_map_short[sequence.parseDefline(s.name)[0]] = s
    print("Database size is " + str(len(db_map)))

    print(
        "Blast started, this might take a bit depending on your dataset size")
    os.system("blastp -db " + shlex.quote(args.database) +
              " -outfmt 3 -num_descriptions 1 -num_alignments 0 -query " +
              shlex.quote(args.query) + " -out query.txt")

    if args.reference == 'uniprotkb':
        # Constant pipeline (no user input): keep the first space-delimited
        # field of every report line that starts with "sp|" / "tr|"-like ids.
        os.system(
            "grep -e \"^[st][pr]|\" query.txt | cut -d\' \' -f1 > UniProt_query.tab"
        )

        # Extract the resulting sequence identifiers
        repSeqNames = set()
        with open('UniProt_query.tab', 'rt') as f:
            for row in f:
                repSeqNames.add(sequence.parseDefline(row.strip())[0])
        print(str(len(repSeqNames)),
              " representative sequences have been found")

        _match_representatives(repSeqNames, db_map_short, seqsforCSV)

        with open("query.txt", newline='') as f:
            reader = csv.reader(f)
            for row in reader:
                if len(row) > 0 and row[0].startswith('Query'):
                    # str(row) is the list's repr; [:-2] drops its closing
                    # quote and bracket after the text following "=".
                    querySeq = (str(row).split("=")[1][:-2].strip())
                elif len(row) > 0 and row[0].startswith(('tr|', 'sp|')):
                    # [2:] drops the opening bracket and quote of the repr.
                    representative = (str(row).split(" ")[0][2:].strip())
                    seqDict[querySeq] = representative

    elif args.reference == 'refseq':
        grab = False  # True while waiting for the first hit line of a query
        repSeqNames = set()

        with open("query.txt", newline='') as f:
            reader = csv.reader(f)
            for row in reader:
                if len(row) > 0 and row[0].startswith('Query'):
                    # NOTE(review): [:-2] removes the last two characters of
                    # the raw line before the first token is taken; it looks
                    # carried over from the UniProt branch - confirm it cannot
                    # truncate a query id that has no description after it.
                    querySeq = (str(
                        row[0]).split("=")[1][:-2].strip().split(" ")[0])
                elif len(row) > 0 and row[0].startswith('Sequences'):
                    grab = True
                    continue
                elif grab:
                    if len(row) > 0 and not row[0].strip() == "":
                        # "<prefix>.<version>": text up to the first space
                        # after the first dot.
                        representative = (row[0].split('.')[0] + "." +
                                          row[0].split('.')[1].split(" ")[0])
                        repSeqNames.add(representative)
                        seqDict[querySeq] = representative
                        grab = False

        _match_representatives(repSeqNames, db_map_short, seqsforCSV)

        print(len(repSeqNames),
              " representative sequences found for " + args.query)

    faOut = args.output + '.fa'

    seq_list = [
        sequence.Sequence(sequence=seq, name=seqname)
        for seqname, seq in seqsforCSV.items()
    ]

    sequence.writeFastaFile(faOut, seq_list)

    csvOut = args.output + '.csv'

    with open(csvOut, 'w', newline='') as f:
        fieldnames = ['Name', 'Representative', 'Sequence']
        thewriter = csv.DictWriter(f, fieldnames=fieldnames)

        thewriter.writeheader()
        for given, rep in seqDict.items():
            thewriter.writerow({
                'Name': given,
                'Representative': rep,
                # A representative that was not found in the database has no
                # stored sequence; write an empty cell instead of raising
                # KeyError after all the work above is done.
                'Sequence': seqsforCSV.get(rep, '')
            })
示例#6
0
def main():
    """Cluster a FASTA database with cd-hit and summarise BLAST result sets.

    Steps, as performed below:

    1. optionally load the user's Tier-2 and Tier-1 FASTA files;
    2. load ``--input`` and run ``cd-hit`` once per redundancy level;
    3. for every cluster pick the Tier-1 members, otherwise one reviewed
       (``sp|``) member, otherwise the shortest member that is not
       ``disqualified``; write the picks to ``db<level>.fa`` and build a
       BLAST database ``db-<level>`` from them;
    4. for every redundancy/e-value pair read ``dataset-<level>-<evalue>.txt``
       and write the listed sequences to ``dataset-<level>-<evalue>.fa``;
    5. count the sequences (and Tier-1 / Tier-2 members) of each result set
       and pass the totals to ``plotSeqs``.

    The ``dataset-*.txt`` reports are only read here; nothing in this
    function creates them, so they must already exist.
    """
    import shlex  # local import: only needed to quote user-supplied values

    parser = argparse.ArgumentParser()
    parser.add_argument("-i",
                        "--input",
                        help="Input FASTA file",
                        required=True)
    parser.add_argument("-db",
                        "--database",
                        help="Database output file name",
                        required=True)
    parser.add_argument("-r",
                        "--redundancy",
                        nargs='*',
                        help="List of redundancy levels",
                        default=[90, 80, 70])
    parser.add_argument("-t1", "--tier1", help="User's Tier1 sequences")
    parser.add_argument("-t2", "--tier2", help="User's Tier2 sequences")
    parser.add_argument("-ml",
                        "--maxlength",
                        help="Max length that the sequence can be",
                        default=800)
    parser.add_argument("-e",
                        "--eval",
                        nargs='*',
                        help="List of evalues",
                        default=[1e-100, 1e-75, 1e-50, 1e-20, 1e-10, 1e-5])
    args = parser.parse_args()

    # Both maps are created up front so the membership tests further down
    # work even when the corresponding option was not given.
    tier1_list = {}  # Tier-1 name -> sequence string
    tier2_list = {}  # Tier-2 name -> entry

    if args.tier2:
        print("tier2 sequences have been provided")

        if '.fa' in args.tier2 or '.fasta' in args.tier2:
            print("tier2 sequences are FASTA file")
            tier2db = sequence.readFastaFile(args.tier2,
                                             sequence.Protein_Alphabet,
                                             ignore=True,
                                             parse_defline=False)
            for s in tier2db:
                tier2_list[s.name] = s
            # Reported after the map is filled (it used to be read before it
            # was assigned, which raised UnboundLocalError).
            print(str(len(tier2_list)) + " sequences in tier2")
        else:
            print("Please provide FASTA file for tier-2")

    if args.tier1:
        print("Tier-1 sequences have been provided")
        if '.fa' in args.tier1 or '.fasta' in args.tier1:

            print("Tier-1 sequences are provided as a FASTA file")
            tier1db = sequence.readFastaFile(args.tier1,
                                             sequence.Protein_Alphabet,
                                             ignore=True,
                                             parse_defline=False)
            for s in tier1db:
                tier1_list[s.name] = "".join(s.sequence)
            print("Tier-1 has " + str(len(tier1_list)) + " sequences")

        else:
            print("Please provide FASTA file for tier-1")

    db100 = sequence.readFastaFile(args.input,
                                   sequence.Protein_Alphabet,
                                   ignore=True,
                                   parse_defline=False)
    db100_map = {}  # map from "long" name to actual entry
    for s in db100:
        db100_map[s.name] = s
    print("Database has " + str(len(db100_map)) + " sequences")

    # User-supplied values are quoted so that spaces or shell metacharacters
    # cannot break (or inject into) the command lines.
    for rr in args.redundancy:
        rs = str(rr)
        os.system('cd-hit -i ' + shlex.quote(args.input) + ' -c ' +
                  shlex.quote('0.' + rs) + ' -T 5 -o ' +
                  shlex.quote('db' + rs) + ' -d 0')

    selected = {}
    for rr in args.redundancy:
        selected[rr] = []
        filename = 'db' + str(rr) + '.clstr'
        clusters = readCDHIT(filename)
        for c in clusters:
            picked_one = False
            shortest = None
            reviewed = None
            for name in clusters[c]:
                if name not in db100_map:
                    continue
                seq = db100_map[name]
                if shortest:
                    if len(seq) < len(shortest) and not disqualified(
                            seq, args):
                        shortest = seq
                elif not disqualified(seq, args):
                    shortest = seq
                if seq.name.startswith('sp|') and not disqualified(
                        seq, args):
                    reviewed = seq
                if name in tier1_list:
                    selected[rr].append(seq)
                    picked_one = True
            # If no Tier-1, prefer "reviewed", then shortest length
            if not picked_one and reviewed:
                selected[rr].append(reviewed)
            elif not picked_one and shortest:
                selected[rr].append(shortest)

    for rr in args.redundancy:
        filename = 'db' + str(rr) + '.fa'
        sequence.writeFastaFile(filename, selected[rr])

    for rr in args.redundancy:
        os.system('makeblastdb -dbtype prot -in ' +
                  shlex.quote('db' + str(rr) + '.fa') + ' -out ' +
                  shlex.quote('db-' + str(rr)))

    for rr in args.redundancy:
        for evalue in args.eval:
            result_file = "dataset-" + str(rr) + '-' + str(evalue)
            tpsIdentifier = set()
            # Reset per report so an unterminated hit list in one file does
            # not leak into the next one.
            grab = False
            with open(result_file + '.txt', 'rt') as f:
                for row in f:
                    if row.startswith('Sequences'):
                        grab = True
                        continue
                    if not grab:
                        continue
                    # The hit list ends at the line starting with "Lambda".
                    if row.startswith('Lambda'):
                        grab = False
                    if not row.strip() == "":
                        identifier = row.split(' ')[0]
                        if identifier != "Lambda":
                            tpsIdentifier.add(identifier)

            seqs = []
            for name in tpsIdentifier:
                # Identifiers that are not database entries are skipped.
                if name not in db100_map:
                    continue
                seq = db100_map[name]
                seqs.append(
                    sequence.Sequence(seq.sequence, seq.alphabet, seq.name,
                                      ''))
            sequence.writeFastaFile(result_file + ".fa", seqs)
            print(result_file + " has " + str(len(seqs)) + "sequences")

    print('Done')

    totalSeqCount = []
    for evalue in args.eval:
        for rr in args.redundancy:
            # The first character of the e-value text is dropped, e.g.
            # "1e-05" -> "e-05". NOTE(review): presumably the label format
            # plotSeqs expects - confirm against plotSeqs.
            ev = str(evalue)[1:]
            red = str(rr)
            result_file = "dataset-" + str(rr) + '-' + str(evalue)
            a = sequence.readFastaFile(result_file + '.fa',
                                       sequence.Protein_Alphabet,
                                       ignore=True,
                                       parse_defline=False)

            names = set(s.name for s in a)
            found_cnt = 0
            tier1_cnt = 0
            tier2_cnt = 0
            for name in names:
                if name not in db100_map:
                    continue
                found_cnt += 1
                if name in tier1_list:
                    tier1_cnt += 1
                # Tier-2 membership is tested against the loaded Tier-2
                # entries (the dict tested before was never filled).
                elif name in tier2_list:
                    tier2_cnt += 1
            print('Processed', found_cnt, 'for', result_file, ' Tier-1:',
                  tier1_cnt, ' Tier-2:', tier2_cnt)
            totalSeqCount.append([ev, red, found_cnt])

    plotSeqs(totalSeqCount)
示例#7
0
# Report the size of both result sets. Single-string print calls behave the
# same on Python 2 and 3; the doubled spaces reproduce the spacing of the
# former print statements.
print("Q5 sequence has  %d  entries" % len(sequences_q5))
print("Q6 sequence has  %d  entries" % len(sequences_q6))

# Names of the entries in each result set.
ids_q5 = [entry.name for entry in sequences_q5]
ids_q6 = [entry.name for entry in sequences_q6]

# Names that occur in both result sets.
common_ids = set(ids_q5).intersection(ids_q6)

print("%d  common matches found" % len(common_ids))

# Save the common entries into a FASTA file as well as a dictionary
# of id:sequence object map. The entries are taken from the Q5 set.
result_sequences = []
result_dictionary = {}

for sequence in sequences_q5:
    if sequence.name in common_ids:
        result_sequences.append(sequence)
        result_dictionary[sequence.name] = sequence

seq.writeFastaFile(ex7_filename, result_sequences)
print("saved results to  %s" % (ex7_filename,))

示例#8
0
     print('Warning: ', nodename, 'has not got a sequence')
 all_children = tree.getDescendantsOf(node, transitive=True)
 if all_children == None:
     print('Warning: ', nodename, 'has no children')
 direct_ancestors = tree.getAncestorsOf(node, transitive=True)
 if direct_ancestors == None:
     print('Warning: ', nodename, 'has no ancestors')
     direct_ancestors = []
 relevant = [node.sequence]
 for child in all_children:
     if not child.sequence:
         pass
         #print 'Warning: ', nodename, 'has a child', child.label, 'with a sequence which is None'
     else:
         allgaps = True
         for pos in child.sequence:
             if pos != '-':
                 allgaps = False
                 break
         if not allgaps:
             relevant.append(child.sequence)
 for parent in direct_ancestors:
     if not parent.sequence:
         print('Warning: ', nodename, 'has an ancestor', parent.label,
               'with a sequence which is None')
     else:
         relevant.append(parent.sequence)
 relevant_aln = sequence.Alignment(relevant)
 saveConsensus(relevant_aln, countgaps=True, filename=nodename + ".txt")
 sequence.writeFastaFile(nodename + ".fa", relevant_aln.seqs)