def __init__(self, batchfile=None, num_cpus=1):
    """Set up the wrapper state and create the underlying HyPhy instance.

    batchfile -- stored as-is on the instance (default None)
    num_cpus  -- forwarded as the second argument of hp._THyPhy
    """
    # NOTE(review): abspath(hp.__file__) is the module *file* path, so the
    # joined value ends in "<file>/res" -- confirm whether the module's
    # directory was intended here.
    resource_root = join(abspath(hp.__file__), 'res')

    self._batchfile = batchfile
    self._execstr = ''
    self._instance = hp._THyPhy(resource_root, num_cpus)

    # Output buffers all start out empty.
    self._stdout = self._stderr = self._warnings = ''
def __init__(self, cwd="", nthreads=1,
             dsid="my_ds", trid="my_tree", lfid="my_lf"):
    """Create the HyPhy instance and record the identifiers used by this object.

    cwd, nthreads -- forwarded unchanged to HyPhy._THyPhy
    dsid          -- stored as self.dsid; self.dsfid is derived as dsid + "f"
    trid, lfid    -- stored as self.trid and self.lfid
    """
    engine = HyPhy._THyPhy(cwd, nthreads)
    self.__instance = engine

    # The version is queried right after the instance exists.
    self.version = self._get_version()

    self.dsid = dsid
    self.dsfid = self.dsid + "f"
    self.trid = trid
    self.lfid = lfid
def __init__(self, cwd='', nthreads=1,
             dsid='my_ds', trid='my_tree', lfid='my_lf'):
    """Create the HyPhy instance and record the identifiers used by this object.

    cwd, nthreads -- forwarded unchanged to HyPhy._THyPhy
    dsid          -- stored as self.dsid; self.dsfid is derived as dsid + 'f'
    trid, lfid    -- stored as self.trid and self.lfid
    """
    engine = HyPhy._THyPhy(cwd, nthreads)
    self.__instance = engine

    # The version is queried right after the instance exists.
    self.version = self._get_version()

    self.dsid = dsid
    self.dsfid = self.dsid + 'f'
    self.trid = trid
    self.lfid = lfid
def __init__(self, cwd=os.getcwd(), nthreads=1, alphabet='ACGT',
             gap_open=20, gap_open2=20, gap_extend=10, gap_extend2=10,
             no_terminal_penalty=1):
    """Create the HyPhy instance and initialise the alignment options.

    cwd, nthreads -- forwarded unchanged to HyPhy._THyPhy.  The default for
                     cwd is evaluated once, when the function is defined,
                     not on each call.
    gap_open2     -- passed to set_gap_open(..., is_first=False)
    gap_extend2   -- passed to set_gap_extend(..., is_first=False)

    NOTE(review): alphabet, gap_open, gap_extend and no_terminal_penalty are
    accepted but not forwarded; the matching setters are still called with
    no arguments, as before.  Their signatures and defaults are not visible
    from here, so wiring them through needs confirmation.
    """
    self.__instance = HyPhy._THyPhy(cwd, nthreads)
    self.call('alignOptions = {};')

    # default settings
    self.set_alphabet()
    self.set_matrix()
    self.set_gap_open()
    # Previously a hard-coded 20, which silently ignored the argument.
    self.set_gap_open(gap_open2, is_first=False)
    self.set_affine()
    self.set_gap_extend()
    # Previously a hard-coded 10, which silently ignored the argument.
    self.set_gap_extend(gap_extend2, is_first=False)
    self.set_terminal()
"""
Use project file to punch out genes from FDA amino acid refs
"""
import os
import HyPhy
import hyphyAlign
import json
from seqUtils import convert_fasta

# HyPhy instance created from the current working directory, then set up
# with hyphyAlign's default settings.
hyphy = HyPhy._THyPhy (os.getcwd(), 1)
hyphyAlign.change_settings(hyphy)

# Read the polyprotein FASTA; the loop below unpacks each entry as (h, s).
with open('fda_hcv_polyprotein.fa', 'rU') as handle:
    fasta = convert_fasta(handle)

# Project definition holding the per-region reference sequences.
with open('/Users/art/git/MiseqPipeline/projects.json', 'rU') as handle:
    proj = json.load(handle)

# Keep the H77 regions, skipping the ones whose name ends with 'seed'.
h77 = {}
for key in proj['regions']:
    if 'H77' not in key or key.endswith('seed'):
        continue
    aa = ''.join(proj['regions'][key]['reference'])
    h77[str(key)] = str(aa)

# NOTE(review): outfile is not written to or closed in the visible part of
# this script -- confirm it is closed further down.
outfile = open('fda_hcv_coords.fa', 'w')

# Align every FASTA sequence against every H77 reference.
for h, s in fasta:
    for gene, refseq in h77.iteritems():
        aquery, aref, ascore = hyphyAlign.pair_align(hyphy, refseq, s)
# Standard OS utilities plus the HyPhy library.
import os
import HyPhy

# Step 1: build a HyPhy interface instance (class _THyPhy).
#   - first argument:  the root directory for HyPhy
#   - second argument: how many threads the computational core should spawn
hyphy_root = os.getcwd()
hyphyInstance = HyPhy._THyPhy(hyphy_root, 2)

# Step 2: run a batch-language command.
# 'ExecuteBF' is the basic interface command: it executes HyPhy batch
# language commands and returns a string representation of the return value
# (if any).  The returned object is of type _THyPhyString and carries
# sData and sLength fields; HyPhy itself disposes of the memory needed to
# store the result.
hyphyResult = hyphyInstance.ExecuteBF("return 2+2;")
print("Testing a trivial HyPhy command. 2+2 = ", hyphyResult.sData)

# ExecuteBF also accepts an optional second argument that can be used to
# "flush" the current state of the system.  Flushing is the default;
# passing False or 0 as the second argument preserves the execution state.
"""
Use project file to punch out genes from FDA amino acid refs
"""
import os
import HyPhy
import hyphyAlign
import json
from seqUtils import convert_fasta

# HyPhy instance created from the current working directory, then set up
# with hyphyAlign's default settings.
hyphy = HyPhy._THyPhy(os.getcwd(), 1)
hyphyAlign.change_settings(hyphy)

# Read the polyprotein FASTA; the loop below unpacks each entry as (h, s).
with open('fda_hcv_polyprotein.fa', 'rU') as handle:
    fasta = convert_fasta(handle)

# Project definition holding the per-region reference sequences.
with open('/Users/art/git/MiseqPipeline/projects.json', 'rU') as handle:
    proj = json.load(handle)

# Keep the H77 regions, skipping the ones whose name ends with 'seed'.
h77 = {}
for key in proj['regions']:
    if 'H77' not in key or key.endswith('seed'):
        continue
    aa = ''.join(proj['regions'][key]['reference'])
    h77[str(key)] = str(aa)

# NOTE(review): outfile is not written to or closed in the visible part of
# this script -- confirm it is closed further down.
outfile = open('fda_hcv_coords.fa', 'w')

# Align every FASTA sequence against every H77 reference.
for h, s in fasta:
    for gene, refseq in h77.iteritems():
        aquery, aref, ascore = hyphyAlign.pair_align(hyphy, refseq, s)
elif i[0] == '>' or i[0] == '#': if len(sequence) > 0: blocks.append([h, sequence]) sequence = '' # reset containers h = i.strip('\n')[1:] else: h = i.strip('\n')[1:] else: sequence += i.strip('\n') try: blocks.append([h,sequence]) # handle last entry except: raise Exception("convert_fasta(): Error appending to blocks [{},{}]".format(h, sequence)) return blocks hyphy = HyPhy._THyPhy(os.getcwd(), 1) # @UndefinedVariable dump = hyphy.ExecuteBF('MESSAGE_LOGGING = 0;', False) hyphyAlign.change_settings(hyphy, alphabet = hyphyAlign.nucAlphabet, scoreMatrix = hyphyAlign.nucScoreMatrix, gapOpen = 20, gapOpen2 = 20, gapExtend = 10, gapExtend2 = 10, noTerminalPenalty = 1) with open('HCV_REF_2012_genome.fasta', 'rU') as handle: genomes = convert_fasta(handle) # keep one per genotype
sequence = '' # reset containers h = i.strip('\n')[1:] else: h = i.strip('\n')[1:] else: sequence += i.strip('\n') try: blocks.append([h, sequence]) # handle last entry except: raise Exception( "convert_fasta(): Error appending to blocks [{},{}]".format( h, sequence)) return blocks hyphy = HyPhy._THyPhy(os.getcwd(), 1) # @UndefinedVariable dump = hyphy.ExecuteBF('MESSAGE_LOGGING = 0;', False) hyphyAlign.change_settings(hyphy, alphabet=hyphyAlign.nucAlphabet, scoreMatrix=hyphyAlign.nucScoreMatrix, gapOpen=20, gapOpen2=20, gapExtend=10, gapExtend2=10, noTerminalPenalty=1) with open('HCV_REF_2012_genome.fasta', 'rU') as handle: genomes = convert_fasta(handle) # keep one per genotype
# Standard OS utilities plus the HyPhy library.
import os
import HyPhy
from six import print_ as print

# Step 1: build a HyPhy interface instance (class _THyPhy).
#   - first argument:  the root directory for HyPhy
#   - second argument: how many threads the computational core should spawn
hyphy_root = os.getcwd()
hyphyInstance = HyPhy._THyPhy(hyphy_root, 2)

# Step 2: run a batch-language command.
# 'ExecuteBF' is the basic interface command: it executes HyPhy batch
# language commands and returns a string representation of the return value
# (if any).  The returned object is of type _THyPhyString and carries
# sData and sLength fields; HyPhy itself disposes of the memory needed to
# store the result.
hyphyResult = hyphyInstance.ExecuteBF("return 2+2;")
print("Testing a trivial HyPhy command. 2+2 = ", hyphyResult.sData)

# ExecuteBF also accepts an optional second argument that can be used to
# "flush" the current state of the system.  Flushing is the default;
# passing False or 0 as the second argument preserves the execution state.
def csf2counts(path, mode, mixture_cutoffs, amino_reference_sequence="/usr/local/share/miseq/refs/csf2counts_amino_refseqs.csv"):
    """
    Calculate HXB2-aligned nucleotide and amino acid counts from a CSF.

    path                     -- path of the input .csf file; the file name is
                                split on '.' to obtain the sample and the
                                reference region
                                (Ex: F00844_S68.HIV1B-pol.0.csf)
    mode                     -- 'Amplicon' or 'Nextera'; any other value logs
                                an error and returns
    mixture_cutoffs          -- sequence of proportions; for each cutoff a
                                consensus is built from the bases whose share
                                of the reads at a position is strictly
                                greater than the cutoff
    amino_reference_sequence -- CSV file whose rows are (region, amino
                                reference sequence)

    Files written next to the input, named from the .csf file name:
        <prefix>.indels.csv   (only when the alignment contains insertions)
        <prefix>.nuc.freqs
        <prefix>.conseq
        <prefix>.amino.freqs

    Always returns None; problems are reported through the root logger.

    NOTE: the '/' divisions below rely on Python 2 integer division.
    """
    import csv, logging, HyPhy, os, sys
    from hyphyAlign import change_settings, get_boundaries, pair_align
    from miseqUtils import ambig_dict, convert_csf, convert_fasta, mixture_dict, translate_nuc

    logger = logging.getLogger()
    hyphy = HyPhy._THyPhy (os.getcwd(), 1)
    change_settings(hyphy) # default gap open penalty 40(20), extension penalty 10(5) - we may need to change these
    amino_alphabet = 'ACDEFGHIKLMNPQRSTVWY*'

    if mode not in ['Amplicon', 'Nextera']:
        return logger.error("{} is an unsupported mode - halting csf2counts".format(mode))

    # set up file paths
    filename = os.path.basename(path)
    root = os.path.dirname(path) if os.path.dirname(path) != '' else '.'
    file_prefix = filename.replace('.csf', '')
    outpath = root+'/'+file_prefix

    # CSF contains sample + region in filename (Ex: F00844_S68.HIV1B-pol.0.csf)
    sample, ref = filename.split('.')[:2]

    # Amino reference sequences in refseqs is used to coordinate normalize our samples
    with open(amino_reference_sequence, "rb") as f:
        input_file = csv.reader(f)
        refseqs = {}
        for row in input_file:
            region, amino = row
            refseqs[region] = amino

    # If we have no reference sequence, we can't align the input sequences
    if ref not in refseqs:
        logger.error("No reference for {} - halting csf2counts".format(ref))
        return

    refseq = refseqs[ref]

    # Load CSF (CSF header, offset, sequence) into fasta data structure
    with open(path, 'rU') as infile:
        fasta, lefts, rights = convert_csf(infile.readlines())

    if len(fasta) == 0:
        # skip empty file
        logger.error('{} is an empty file'.format(filename))
        return

    # CSFs come from self-alignment derived SAMs: the reads are out of frame.
    frame_evidence = {}
    for frame in range(3):
        frame_evidence[frame] = 0

    # Look at first five reads in CSF and vote on correct ORF
    for read_index in range(min(5, len(fasta))):  # robust to having fewer than 5 reads
        header, seq = fasta[read_index]
        max_score = -999
        best_ORF = 0
        possible_ORFs = [0, 1, 2]

        # Determine best ORF for this read
        prefix = ('-'*lefts[header] if mode == 'Nextera' else '')
        for frame in possible_ORFs:
            p = translate_nuc(prefix + seq, frame)
            aquery, aref, ascore = pair_align(hyphy, refseq, p)
            if ascore > max_score:
                best_ORF = frame
                max_score = ascore

        # Read provides 1 of 5 votes for best ORF
        frame_evidence[best_ORF] += 1

    best_frame = max(frame_evidence, key=lambda n: frame_evidence[n])
    logging.debug('Best ORF = %d' % best_frame)

    nuc_counts = {}  # Base counts by self-consensus coordinate
    aa_counts = {}   # Amino counts by self-consensus coordinate
    pcache = []      # Cache protein sequences

    # CSF reads aligned against self-consensus: offset is with respect to self
    # For each sequence in the csf
    for i, (header, seq) in enumerate(fasta):
        # Determine the offset (Amplicon runs have no offset)
        left = lefts[header] if mode == 'Nextera' else 0

        # Amplicons store read counts in the CSF header
        count = 1 if mode == 'Nextera' else int(header.split('_')[1])

        # Determine nuc counts with respect to self-consensus coordinates
        for j, nuc in enumerate(seq):
            pos = left + j
            if pos not in nuc_counts:
                nuc_counts.update({pos: {}})
            if nuc not in nuc_counts[pos]:
                nuc_counts[pos].update({nuc: 0})
            nuc_counts[pos][nuc] += count

        # Determine amino counts with respect to self-consensus coordinates
        p = translate_nuc('-'*left + seq, best_frame)
        pcache.append(p)

        # boundaries of read in amino acid space
        aa_left = (left + best_frame) / 3
        aa_right = (rights[header] + left + best_frame) / 3

        for pos, aa in enumerate(p):
            # NOTE: gap characters inside the read boundaries are counted as
            # well; the filter that skipped '-' is disabled.
            if pos < aa_left or pos >= aa_right:
                continue
            if pos not in aa_counts:
                aa_counts.update({pos: {}})
            if aa not in aa_counts[pos]:
                aa_counts[pos].update({aa: 0})
            aa_counts[pos][aa] += count

    # Generate amino plurality consensus for query to reference coordinate mapping
    aa_coords = aa_counts.keys()
    aa_coords.sort()
    aa_max = ''
    for pos in range(min(aa_coords), max(aa_coords)+1):
        if pos in aa_coords:
            intermed = [(aa_count, amino) for amino, aa_count in aa_counts[pos].iteritems()]
            intermed.sort(reverse=True)
            aa_max += intermed[0][1]
        else:
            aa_max += '?'  # no coverage but not a gap

    logger.debug('Amino plurality consensus = ' + aa_max)
    aquery, aref, ascore = pair_align(hyphy, refseq, aa_max)
    left, right = get_boundaries(aref)  # Coords of first/last non-gap character

    logger.debug('Aligned amino plurality conseq = ' + aquery)
    logger.debug('Aligned reference sequence = ' + aref)

    qindex_to_refcoord = {}  # Query <-> reference coordinate mapping
    inserts = []             # Keep track of which aa positions are insertions
    qindex = 0               # Where we are in the query?
    rindex = 0               # Where we are in the reference?
    ref_coords = range(len(aref))

    # For each coordinate on the reference, create a mapping to the query
    for i in ref_coords:
        # Do not consider parts of the query outside of the reference
        if i < left:
            qindex += 1
        elif i >= right:
            break
        # A gap in the reference is an insertion in the query which we want to skip in the mapping
        elif aref[i] == '-':
            inserts.append(qindex)  # Store insert location in query coordinate space
            qindex += 1             # Track along the query
        # If there's a gap in the query we are only effectively tracking along the pre-alignment reference
        elif aquery[i] == '-':
            rindex += 1
        # Normal case: tracking forward on both sequences
        else:
            qindex_to_refcoord[qindex] = rindex
            qindex += 1
            rindex += 1

    logger.debug('qindex_to_refcoord: ' + str(qindex_to_refcoord))

    # Write inserts to an indels.csv file
    if len(inserts) > 0:
        with open(outpath+".indels.csv", 'w') as indelfile:
            indelfile.write('insertion,count\n')
            indel_counts = {}
            for p in pcache:
                ins_str = str(inserts[0])
                last_i = -1
                for i in inserts:
                    if last_i > -1 and i - last_i > 1:
                        # end of a contiguous indel
                        ins_str += ',%d' % i
                    try:
                        ins_str += p[i]
                    except IndexError:
                        # this read's translation does not reach position i
                        break
                    last_i = i
                if ins_str not in indel_counts:
                    indel_counts.update({ins_str: 0})
                indel_counts[ins_str] += 1
            for ins_str, count in indel_counts.iteritems():
                indelfile.write('%s,%d\n' % (ins_str, count))

    # Initialize initial (blank) consensus sequence for each mixture rule
    maxcon = ''
    conseqs = ['' for cut in mixture_cutoffs]
    query_codon_pos = 0
    nuc_coords = nuc_counts.keys()  # nucs[self-coord][nuc] = count
    nuc_coords.sort()

    # account for assembly offset due to extra bases in sample-specific consensus
    nuc_assembly_offset = min(lefts.values())

    # Output nucleotide counts in reference coordinate space to nuc.freqs files.
    # The context manager closes the file even if a position raises.
    with open(outpath+'.nuc.freqs', 'w') as nucfile:
        nucfile.write("query.nuc.pos,refSeq.nuc.pos,A,C,G,T\n")

        for query_nuc_pos in nuc_coords:
            nucleotide_counts = [nuc_counts[query_nuc_pos].get(nuc, 0) for nuc in 'ACGT']
            nucleotide_counts_string = ','.join(map(str, nucleotide_counts))

            # Convert nucleotide query index into reference index
            try:
                # best frame is adjusted by shift from query to assembly coordinates
                adjustment = best_frame - (3 - nuc_assembly_offset%3)%3
                query_aa_pos = (query_nuc_pos - nuc_assembly_offset + adjustment) / 3
                query_codon_pos = (query_nuc_pos - nuc_assembly_offset + adjustment) % 3
                ref_aa_pos = qindex_to_refcoord[query_aa_pos]
                ref_nuc_pos = 3*ref_aa_pos + query_codon_pos
                nucfile.write(','.join(map(str, [query_nuc_pos+1, ref_nuc_pos+1, nucleotide_counts_string])))
                nucfile.write('\n')
            except KeyError:
                # Positions without a reference mapping are left out of both
                # the frequency file and the consensus sequences.
                logger.debug('No coordinate mapping for query nuc %d / amino %d (%s)' % (query_nuc_pos, query_aa_pos, filename))
                continue

            # Store self-aligned nucleotide plurality conseqs
            intermed = [(count, nuc) for nuc, count in nuc_counts[query_nuc_pos].iteritems()]
            intermed.sort(reverse=True)
            maxcon += intermed[0][1]

            # Determine the number of bases in total at this query position
            total_count = sum([count for count, nuc in intermed])

            for ci, mixture_cutoff in enumerate(mixture_cutoffs):
                mixture = []

                # If a base is greater than the proportion cutoff, the base contributes
                for count, nuc in intermed:
                    if float(count) / total_count > mixture_cutoff:
                        mixture.append(nuc)

                # If an N exists with other bases, those bases take precedence
                if 'N' in mixture:
                    if len(mixture) > 1:
                        mixture.remove('N')
                    else:
                        conseqs[ci] += 'N'
                        logger.debug("N was the majority base at position %d - %s (mixture_cutoff = %f)" % (query_nuc_pos, filename, mixture_cutoff))
                        continue

                # If there is a gap, but also bases, those bases take precedence
                if '-' in mixture:
                    if len(mixture) > 1:
                        mixture.remove('-')
                    else:
                        conseqs[ci] += '-'
                        continue

                # Attach mixture (If one exists) to the conseq with appropriate mixture cutoff rule
                if len(mixture) > 1:
                    mixture.sort()
                    conseqs[ci] += ambig_dict[''.join(mixture)]
                elif len(mixture) == 1:
                    conseqs[ci] += mixture[0]
                else:
                    # Mixture of length zero, no bases exceed cutoff
                    conseqs[ci] += 'N'

    # Store the plurality (MAX) and per-cutoff consensus sequences in .conseq files
    with open(outpath+'.conseq', 'w') as confile:
        confile.write('>%s_MAX\n%s\n' % (sample, maxcon))
        for ci, cutoff in enumerate(mixture_cutoffs):
            confile.write('>%s_%1.3f\n%s\n' % (sample, cutoff, conseqs[ci]))

    # Write amino acid counts in reference coordinate space in amino.freqs files
    with open(outpath+".amino.freqs", 'w') as aafile:
        aafile.write("query.aa.pos,refseq.aa.pos,%s\n" % (','.join(list(amino_alphabet))))

        # Smallest covered amino position; constant for the whole loop.
        aa_offset = min(aa_coords)

        for qindex, ref_aa_pos in qindex_to_refcoord.iteritems():
            # adjust for assembly offset
            aa_pos = qindex + aa_offset

            # Ignore query inserts
            # NOTE(review): inserts holds alignment query indices (qindex),
            # while aa_pos has aa_offset added -- confirm both are meant to
            # be in the same coordinate space.
            if aa_pos in inserts:
                logger.debug("%d is an insert - ignoring" % (aa_pos))
                continue

            try:
                aa_counts_string = ','.join(map(str, [aa_counts[aa_pos].get(aa, 0) for aa in amino_alphabet]))
                # reference position is written 1-based
                aafile.write('%d,%d,%s\n' % (aa_pos, ref_aa_pos+1, aa_counts_string))
            except KeyError:
                logger.debug("No query-ref mapping available for aapos=%d (%s)" % (aa_pos, filename))
def g2p_scoring(csf_path, g2p_alignment_cutoff):
    """
    Take an env (amplicon) CSF and generate a v3prot file.

    Each v3prot record has:
        Header: contains the G2P FPR and read count
        Sequence: protein aligned V3

    The CSF must be from an amplicon run: column 1 must contain the
    rank + count (the count is read from the last '_'-separated field).

    csf_path             -- path to the CSF; its name must contain
                            "HIV1B-env" and the path must end with '.csf'
    g2p_alignment_cutoff -- sequences whose alignment score is below this
                            value are rejected

    Files written, both derived from csf_path by replacing '.csf':
        .badV3   -- rejected sequences with the rejection reasons
        .v3prot  -- accepted V3 protein variants, largest count first

    Returns None; on the early exits the value of logger.error() is returned.
    """
    import logging,os,sys
    from hyphyAlign import apply2nuc, change_settings, get_boundaries, HyPhy, pair_align, refSeqs
    from minG2P import conan_g2p
    from miseqUtils import translate_nuc

    csf_filename = os.path.basename(csf_path)
    prefix = csf_filename.split('.')[0]
    logger = logging.getLogger()
    hyphy = HyPhy._THyPhy (os.getcwd(), 1)  # HyPhy is used for alignment
    change_settings(hyphy)                  # Configure scoring matrix / gap penalties
    refseq = translate_nuc(refSeqs['V3, clinical'], 0)  # V3 ref seq is NON-STANDARD: talk to Guin

    if csf_filename.find("HIV1B-env") == -1 or not csf_path.endswith('.csf'):
        return logger.error("{} is not an HIV1B-env CSF file".format(csf_filename))

    # Store CSF in fasta-like variable called sequences
    sequences = []
    with open(csf_path, 'rU') as csf_file:
        for line in csf_file:
            header, left_offset, seq_no_gaps = line.strip("\n").split(",")
            sequences.append((header, seq_no_gaps))

    if len(sequences) == 0:
        # skip empty file
        return logger.error('%s is an empty file' % csf_filename)

    # Determine offset from 1st sequence to correct frameshift induced by sample-specific remapping
    seq1 = sequences[0][1].strip("-")
    best_offset = 0
    best_score = -999
    possible_ORFs = [0, 1, 2]
    for offset in possible_ORFs:
        aaEnvSeq = translate_nuc(seq1, offset)
        aquery, aref, ascore = pair_align(hyphy, refseq, aaEnvSeq)
        if ascore > best_score:
            best_offset = offset
            best_score = ascore

    # For each env sequence, extract the V3 nucleotide sequence.
    # The context manager closes the .badV3 file even if a sequence raises.
    v3nucs = {}
    with open(csf_path.replace('.csf', '.badV3'), 'w') as badfile:
        for header, seq in sequences:
            count = int(header.split('_')[-1])
            seq = seq.replace("-","")                   # Strip dashes at flanking regions generated by alignment
            aaEnvSeq = translate_nuc(seq, best_offset)  # Translate env on correct ORF
            aquery, aref, ascore = pair_align(hyphy, refseq, aaEnvSeq)
            left, right = get_boundaries(aref)          # Get left/right boundaries of V3 protein
            v3prot = aquery[left:right]                 # Extract V3 protein

            # Use alignment to extract V3 nuc seq
            # NOTE(review): 3*left-best_offset is negative when left == 0
            # and best_offset > 0, which slices from the end of seq --
            # confirm this cannot occur.
            v3nuc = apply2nuc(seq[(3*left-best_offset):], v3prot,
                              aref[left:right], keepIns=True, keepDel=False)

            # Drop V3 data that don't satisfy quality control
            if 'N' in v3nuc or not v3prot.startswith('C') or not v3prot.endswith('C') or '*' in v3prot or ascore < g2p_alignment_cutoff or len(v3prot) < 32 or len(v3prot) > 40:
                # NOTE(review): sequences rejected only for their length get
                # no reason label (the reason field is just '|||').
                badfile.write('>%s_reason_%s\n%s\n' % (header,
                    '|'.join(['stopcodon' if '*' in v3prot else '',                     # V3 can't have internal stop codon
                              'lowscore' if ascore < g2p_alignment_cutoff else '',      # The G2P alignment can't be poor
                              'cystines' if not v3prot.startswith('C') or not v3prot.endswith('C') else '',  # V3 must start/end with C
                              'ambig' if 'N' in v3nuc else '']),seq))                   # There must be no unknown bases
            else:
                # Track the count of each v3 nucleotide sequence
                if v3nuc in v3nucs:
                    v3nucs[v3nuc] += count
                else:
                    v3nucs.update({v3nuc: count})

    # Calculate g2p scores for each v3 nuc sequence
    v3prots = {}
    for v3nuc, count in v3nucs.iteritems():
        g2p, fpr, aligned = conan_g2p(v3nuc)
        if g2p is None:
            continue

        # Track the count of each protein sequence
        if aligned in v3prots:
            v3prots[aligned]['count'] += count
        else:
            # Dict within dict - store count and fpr for each sequence
            v3prots.update({aligned: {'count': count, 'fpr': fpr}})

    # Collect v3 prot sequences and their output (v is a dict mapping to count and fpr)
    intermed = [(v['count'], v3prot) for v3prot, v in v3prots.iteritems()]
    intermed.sort(reverse=True)

    # For this sample, write a v3prot file containing the prefix, sequence, rank, count, and fpr
    v3prot_path = csf_path.replace('.csf', '.v3prot')
    logger.info("Writing results to {}".format(v3prot_path))
    with open(v3prot_path, 'w') as v3protfile:
        for i, (count, v3prot) in enumerate(intermed):
            fpr = v3prots[v3prot]['fpr']
            v3protfile.write(">{}_variant_{}_count_{}_fpr_{}\n{}\n".format(prefix, i, count, fpr, v3prot))