def binarize_tree(self, next_id):
    oid = next_id
    tree = self.parse
    amr = self.amr

    # handle all-terminal rules
    if not any(s[0] == '#' for s in tree.leaves()):
        return [VoRule(next_id, self.symbol, self.weight, self.amr, self.parse,
                       self.rhs1_visit_order, self.rhs2_visit_order)], next_id + 1

    # handle rules containing nonterminals
    rules = []
    try:
        tree, amr, at_rules, next_id = self.collapse_amr_terminals(tree, amr, next_id)
        rules += at_rules
        tree, amr, ts_rules, next_id = self.merge_tree_symbols(tree, amr, next_id)
        rules += ts_rules
    except BinarizationException:
        log.warn('Unbinarizable rule!')
        return None, oid

    # sanity check as above
    assert isinstance(tree, str)
    assert len(amr.triples()) == 1
    rules.append(VoRule(next_id + 1, self.symbol, self.weight, amr, tree))
    return rules, next_id + 2
def binarize(self, next_id):
    oid = next_id
    tree = self.parse
    amr = self.amr

    # handle all-terminal rules
    if not any(s[0] == '#' for s in tree.leaves()):
        return [VoRule(next_id, self.symbol, self.weight, self.amr, self.parse,
                       self.rhs1_visit_order, self.rhs2_visit_order)], next_id + 1

    # handle rules containing nonterminals
    rules = []
    try:
        tree, amr, at_rules, next_id = self.collapse_amr_terminals(tree, amr, next_id)
        rules += at_rules
        string = tree.leaves()
        string, amr, st_rules, next_id = self.collapse_string_terminals(string, amr, next_id)
        rules += st_rules
        string, amr, nt_rules, next_id = self.merge_string_nonterminals(string, amr, next_id)
        rules += nt_rules
    except BinarizationException:
        log.warn('Unbinarizable rule!')
        return None, oid

    # sanity check---did we completely binarize the rule?
    assert len(string) == 1
    assert len(amr.triples()) == 1
    rules.append(VoRule(next_id + 1, self.symbol, self.weight, amr, string[0]))
    return rules, next_id + 2
def backup_redo_log(self, save_path):
    dir_log_source = 'SYSTEMDB' if self.target_db.upper() == 'SYSTEMDB' \
        else 'DB_{}'.format(self.target_db.upper())
    default_log_path = '{}/backup/log/{}'.format(self.dir_instance, dir_log_source)
    cp_redo_log = "su - {} -c 'cp {}/* {}'".format(self.hana_adm, default_log_path, save_path)
    try_times = 3
    try:
        while try_times <= 3:
            result = exec_cmd2(cp_redo_log)
            status = result['ret']
            output = result['msg'].strip()
            if status == 0:
                return True
            else:
                log.warn('[TASK_ID:' + str(self.task_id) + '] backup_redo_log cmd fail! dump_cmd:' +
                         cp_redo_log + 'status:' + str(status) + ' output:' + output)
                exec_cmd2('rm -rf {}/log_*'.format(self.backup_dir))
                return False
    except Exception as ex:
        error()
        log.error('[backup_redo_log]' + str(type(ex)) + ":" + str(ex))
        return False
def break_clusters(self, clusters, *args, **kwargs):
    log.debug('Breaking clusters with:\n{}'.format(str(self)))
    result = []
    for i, cluster in enumerate(clusters):
        if self.to_break(cluster):
            try:
                sub_clusters = self.break_cluster(cluster, *args, **kwargs)
                if not sub_clusters:
                    log.warn('Cluster {} not broken'.format(cluster.id))
                    result.append(cluster)
                else:
                    log.info('Breaking cluster {} into {} sub_clusters'.format(
                        cluster.id, len(sub_clusters)))
                    result.extend(sub_clusters)
            except (lpinterface.NoSolutionsError, UnboundLocalError, TypeError, ValueError) as e:
                log.error('Cluster breaking failed for cluster {} - see log'.format(cluster.id))
                log.debug(sys.exc_info())
                result.append(cluster)
        else:
            result.append(cluster)
    return result
def iterate_fasta(filename='../database/V-QUEST-reference-allele-db+no-period-references.clustalw.fasta'):
    def strip_fasta_ID(s):
        # strips out allele name from the IMGT fasta naming convention
        s = s.split('|')
        return (s[1], s[3], s[0])  # (name, functional_value, accession)

    if not os.path.exists(filename):
        log.info('Fetching IMGT V gene reference nucleotide sequences')
        if not fetch_reference('IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP'):
            log.error('Unable to find/fetch from IMGT V gene nucleotide references')
            os.sys.exit()

    consensus = ''
    current_gene = None
    return_db = OrderedDict()
    consensus = get_consensus().lower().replace('-', '.')

    for record in SeqIO.parse(filename, 'fasta'):
        allele, functional, accession = strip_fasta_ID(record.description)
        # if allele != 'IGHV1-18*01':
        #     continue
        # log.debug('\nAllele {}'.format(allele))
        seq = str(record.seq).lower().replace('-', '.')
        # log.debug('Allele: ' + allele)
        if not current_gene or allele.split('*')[0] != current_gene:  # first entry in iteration
            current_gene = allele.split('*')[0]
            # return_db[current_gene] = OrderedDict({'alleles': OrderedDict()})
        # log.debug('allele seq:\n{}\nConsensus seq:\n{}'.format(seq, consensus))
        length = len(seq.replace('.', ''))
        variants, seq, msg = call_variants.get_variants(seq, consensus)
        # log.debug('Has {} variants'.format(len(variants)))
        if msg:
            log.warn('\nAllele {}'.format(allele))
            common.log_msg(msg)
        return_db[allele] = OrderedDict({
            'imgt_accession': accession,
            'functional': functional,
            'seq': seq,
            'length': length
        })
        if variants:
            # log.debug(variants)
            if __name__ != '__main__':
                variants = sets.Set([(x['pos'], x['op']) for x in variants])
            return_db[allele]['variants'] = variants
        # remove_IMGT_periods(return_db[current_gene]['alleles'][allele])
    return return_db
def parse_karolinska_html(page, func):
    soup = BeautifulSoup(page, 'lxml')
    data = autodict()
    dbsnp = collections.defaultdict(lambda: '*')
    tables = soup.select('table table')
    if len(tables) == 0 or True:
        tables = soup.select('table')
    for table in tables:
        for row in table.find_all('tr'):
            # skip header
            items = func(row, dbsnp)
            if len(items) < 3 or len(filter(None, items[2])) == 0 or items[0] == 'Allele' or items[0] == '':
                continue
            items[0] = items[0].split()[0]
            if 'X' in items[0] or 'x' in items[0]:
                log.warn('Karolinska: Ignoring {}', items[0])
                continue
            if items[0] in data:
                log.warn('Karolinska: {} already exists, overwriting it', items[0])
                log.debug('Karolinska: overwriting {} with {}', items[0], ','.join(map(str, items[2])))
            data[items[0]].update({
                'mutations': items[2],
                'phenotype': {
                    'invivo': items[-3],
                    'invitro': items[-2],
                }
            })
    return data
def check_host_groups(self):
    """
    This method checks if some host group exists
    """
    for item in self.group_list:
        tenant_name = item[0]
        payload = {
            "jsonrpc": "2.0",
            "method": "hostgroup.exists",
            "params": {
                "name": tenant_name
            },
            "auth": self.api_auth,
            "id": 1
        }
        response = self.contact_zabbix_server(payload)
        if response['result'] is False:
            log.warn("Host Group %s does not exist, creating ... " % tenant_name)
            payload = {
                "jsonrpc": "2.0",
                "method": "hostgroup.create",
                "params": {"name": tenant_name},
                "auth": self.api_auth,
                "id": 2
            }
            self.contact_zabbix_server(payload)
        else:
            log.info("Host Group %s already exists ..." % tenant_name)
def process_mut_helper(m, dbsnp):
    result = []
    # Match 100C>T, -100C>T, -100 C>T etc.
    rs = re.findall(r'\s*?([-0-9_]+)\s*?((ins|del|dup|[ACGT])(\S*))', ' ' + m)
    if len(rs) == 0:
        return None
    for r in rs:
        pos, op = int(r[0].split('_')[0]), r[1].split()[0]
        op = op.replace('*', '')
        if len(op) == 1:  # e.g. 51A
            op = 'A>{}'.format(op)
        elif '>' in op and len(op) != 3:  # e.g. -1601_-1600GA>TT
            op = op.split('>')
            assert (len(op) == 2)
            if len(op[0]) == len(op[1]):
                result += [[pos + i, '{}>{}'.format(op[0][i], op[1][i]), dbsnp]
                           for i in xrange(len(op[1]))
                           if op[0][i] != op[1][i]]
                continue
            else:  # e.g. 3030G>G/A
                if '/' in op[1] and len(op[0]) == 1:
                    result += [[pos, '{}>{}'.format(op[0], c), dbsnp]
                               for c in op[1].split('/')
                               if op[0] != c]
                else:
                    log.warn('Main: Ignoring {}', m)
                continue
        if op[:3] == 'dup':
            op = 'ins' + op[3:]
        if op[-2:] == 'x2':  # detect ins<something>x2
            op = op[:-2] + op[3:-2]
        result.append([pos, op, dbsnp])
    return result
def test_scip(name):
    try:
        model = SCIP(name)
        log.warn('Using SCIP')
    except ImportError:
        log.warn('SCIP not found. Please install SCIP and pyscipopt Python package.')
        model = None
    return model
def test_gurobi(name):
    try:
        model = Gurobi(name)
        log.warn('Using Gurobi')
    except ImportError:
        log.warn('Gurobi not found. Please install Gurobi and gurobipy Python package.')
        model = None
    return model
def em_step(self, corpus, parser_class, normalization_groups, bitext=False):
    """
    Perform a single step of EM on the corpus.
    """
    ll = 0.0
    counts = defaultdict(float)
    parser = parser_class(self)

    if bitext:
        if parser_class == ParserTD:
            log.err("Bigraph parsing with tree decomposition based parser is not yet implemented. Use '-p basic'.")
            sys.exit(1)
        parse_generator = parser.parse_bitexts(corpus)
    else:
        if self.rhs1_type == "string":
            if parser_class == ParserTD:
                log.err("Parser class needs to be 'basic' to parse strings.")
                sys.exit(1)
            else:
                parse_generator = parser.parse_strings(corpus)
        else:
            parse_generator = parser.parse_graphs(corpus)

    i = 0
    for chart in parse_generator:
        i += 1
        if not chart:
            log.warn("No parse for sentence %d." % i)
            continue
        inside_probs = chart.inside_scores()
        outside_probs = chart.outside_scores(inside_probs)
        ll += inside_probs["START"]
        counts_for_graph = chart.expected_rule_counts(inside_probs, outside_probs)
        for r in counts_for_graph:
            counts[r] = counts[r] + counts_for_graph[r]

    for r in counts:
        if r in counts:
            self[r].weight = counts[r]
        else:
            self[r].weight = LOGZERO

    self.normalize_by_groups(normalization_groups)
    return ll
def warn_candidates(self, cluster):
    from common import get_columns
    ground_truth = [x.id.split('_')[0] for x in cluster]
    count = get_columns(Counter(ground_truth).most_common())
    missing = set(ground_truth).difference(set([x.id for x in cluster.candidates]))
    if missing:
        log.warn('---\nCluster {}\nMissing {} from candidates\nGround Truth:\n{}'.format(
            cluster.id, str(missing), count))
def __init__(self, servers, **runner_args):
    self.servers = ' '.join(servers).split(' ')
    self.runner_args = runner_args
    for command in commands:
        if hasattr(self, command.module_name):
            log.warn('{} conflicts with existing attribute'.format(command.name))
            continue
        # bind the current command as a default argument so each generated
        # method keeps its own command instead of the loop's last value
        run = lambda command=command, **arguments: command.execute(self.servers, arguments)
        setattr(self, command.module_name, run)
def get_item_data(self, sample):
    # print "Pool Gthead Num:", self.pool.running()
    item_data = {'host': '', 'key': 'error', 'value': ''}
    try:
        resource_id = sample['resource_id']
        counter_name = sample['counter_name']
        counter_volume = sample['counter_volume']
        # got resource_id
        if resource_id.split('-')[0] == 'instance':
            resource_id = sample['resource_metadata']['instance_id']
        # got item key
        counter_name = transfer_item_key(counter_name)
        if counter_name in self.moniter_items:
            item_data = {'host': resource_id, 'key': counter_name, 'value': counter_volume}
    except Exception, e:
        log.warn(str(e))
    return item_data
def ceilometer_callback(self, ch, method, properties, body):
    """
    Method used by method ceilometer_amq() to filter messages by type of message.

    :param ch: refers to the head of the protocol
    :param method: refers to the method used in callback
    :param properties: refers to the properties of the message
    :param body: refers to the message transmitted
    """
    payload = json.loads(body)
    try:
        message_body = json.loads(payload["oslo.message"])
        samples = message_body["args"]["data"]
        # print "--------------------------------------------------"
        self.pool.spawn_n(self.zabbix_sender.consume_samples, samples)
    except Exception, e:
        log.warn(str(e))
def set_lsf(cfg):
    """
    Set LSF specific :py:data:`~lrms.common.Config` attributes.

    :param cfg: parsed arc.conf
    :type cfg: :py:class:`ConfigParser.ConfigParser`
    """
    Config.lsf_bin_path = str(cfg.get('common', 'lsf_bin_path')).strip('"') \
        if cfg.has_option('common', 'lsf_bin_path') else '/usr/bin'
    if cfg.has_option('common', 'lsf_profile_path'):
        Config.lsf_setup = 'source %s &&' % str(cfg.get('common', 'lsf_profile_path')).strip('"')
    else:
        warn('lsf_profile_path not set in arc.conf', 'lsf')
        Config.lsf_setup = ''
    Config.localtransfer = False
    Config.lsf_architecture = str(cfg.get('common', 'lsf_architecture')).strip('"') \
        if cfg.has_option('common', 'lsf_architecture') else ''
def fix_candidate_rev_comp(self, cluster):
    mappings = self.minimap.run(
        [SeqRecord('cons', cluster.consensus.replace('.', ''))],
        cluster.candidates,
        params=self.minimap.params + ' -N{}'.format(len(cluster.candidates)))
    mappings = dict([(m.tName, m) for m in mappings])
    for c in cluster.candidates:
        try:
            if mappings[c.id].strand == '-':
                c.seq = str(Seq(c.seq).reverse_complement())
                log.debug('Reverse complemented candidate {}'.format(c.id))
        except KeyError:
            log.warn('Breaking cluster {}: candidate {} not in rev-compl mapping. Read support: {}'.format(
                cluster.id, c.id, c.read_mapping_support))
def hana_db_backup(self):
    try:
        config = getconf()
        try_times = 0
        save_path = os.path.join(self.backup_dir, self.target_db)
        if self.backup_mode == config.DB_BACKUP_TYPE_FULL:
            self.full_backup_clear_old_log()
            backup_command = r"\"backup data for {} using file ('{}/full')\"".format(
                self.target_db, save_path)
        else:
            backup_command = r"\"backup data DIFFERENTIAL for {} using file ('{}/diff')\"".format(
                self.target_db, save_path)
        exec_command = self.system_db_exec_command_str(backup_command)
        exec_command_log = exec_command.replace(
            r'-p \"{}\"'.format(self.system_db_pwd), '-p ******')
        log.info('backup cmd is {}:'.format(exec_command_log))
        while try_times < 3:
            log.debug('[TASK_ID:' + str(self.task_id) + '] hana_db_backup cmd execute. cmd:' +
                      exec_command_log)
            result = exec_cmd2(exec_command)
            status = result['ret']
            output = result['msg'].strip()
            log.debug('[TASK_ID:' + str(self.task_id) + '] hana_db_backup cmd finish! status:' +
                      str(status) + ' output:' + output)
            try_times += 1
            if status != 0:
                log.warn('[TASK_ID:' + str(self.task_id) + '] hana_db_backup cmd fail! dump_cmd:' +
                         exec_command_log + 'status:' + str(status) + ' output:' + output)
                continue
            if self.backup_mode == config.DB_BACKUP_TYPE_FULL:
                self.gen_hana_fullback_info_file()
                return True
            if self.backup_mode == config.DB_BACKUP_TYPE_DIFF:
                if self.backup_redo_log(save_path):
                    return True
        return False
    except Exception as ex:
        error()
        log.error('[HANA_DB_BACKUP]' + str(type(ex)) + ":" + str(ex))
        return False
def create_host(self, instance_name, instance_id, tenant_name):
    """
    Method used to create a host in Zabbix server

    :param instance_name: refers to the instance name
    :param instance_id: refers to the instance id
    :param tenant_name: refers to the tenant name
    """
    group_id = self.find_group_id(tenant_name)
    if not instance_id in instance_name:
        instance_name = self.zabbix_proxy_name + '_1_' + instance_id
    log.warn("VM Instance %s does not exist, creating ... " % instance_name)
    payload = {
        "jsonrpc": "2.0",
        "method": "host.create",
        "params": {
            "host": instance_id,
            "name": instance_name,
            "proxy_hostid": self.proxy_id,
            "interfaces": [
                {
                    "type": 1,
                    "main": 1,
                    "useip": 1,
                    "ip": "127.0.0.1",
                    "dns": "",
                    "port": "10050"
                }
            ],
            "groups": [
                {
                    "groupid": group_id
                }
            ],
            "templates": [
                {
                    "templateid": self.template_id
                }
            ],
        },
        "auth": self.api_auth,
        "id": 1
    }
    self.contact_zabbix_server(payload)
def get_rates(product, start_dt, end_dt, sec_per_tick):
    """
    Returns the rates with the schema
        [unix time, low, high, open, close, volume]
    for the specified product and time interval.
    """
    cur_end = end_dt
    all_rates = []
    while cur_end > start_dt:
        cur_start = cur_end - timedelta(seconds=sec_per_tick * max_ticks)
        # We want to stop once we reach the initial start datetime.
        if cur_start < start_dt:
            cur_start = start_dt
        rates, resp = marketdata.get_rates(product, start_dt=cur_start, end_dt=cur_end,
                                           sec_per_tick=sec_per_tick)
        if resp.status_code != 200:
            # Rate limit, wait another fetch delay
            if resp.status_code == 429:
                log.warn('429 Too Many Requests, waiting for {}s to retry...'.format(_fetch_delay))
                time.sleep(_fetch_delay)
                continue
            log.error('non-200 status code when SCRAPING HISTORICAL RATES')
            log.error('status code: ' + str(resp.status_code))
            log.error('reason: ' + resp.reason)
            log.error('message: ' + resp.text)
            break
        all_rates.extend(rates)
        # Update cur_end for the next retrieval
        fetched_start = datetime.utcfromtimestamp(rates[-1][0])
        cur_end = fetched_start - timedelta(seconds=sec_per_tick)
        time.sleep(_fetch_delay)
    return all_rates
def get_template_id(self):
    """
    Method used to check if the template already exists. If not, creates one

    :return: returns the template ID
    """
    global template_id
    payload = {
        "jsonrpc": "2.0",
        "method": "template.exists",
        "params": {
            "host": self.template_name
        },
        "auth": self.api_auth,
        "id": 1
    }
    log.info("Getting Platform VM's Template id ....")
    response = self.contact_zabbix_server(payload)
    if response['result'] is True:
        payload = {
            "jsonrpc": "2.0",
            "method": "template.get",
            "params": {
                "output": "extend",
                "filter": {
                    "host": [
                        self.template_name
                    ]
                }
            },
            "auth": self.api_auth,
            "id": 1
        }
        response = self.contact_zabbix_server(payload)
        for item in response['result']:
            template_id = item['templateid']
        log.info("Template exists ....\n Template id:%s" % template_id)
    else:
        log.warn("Template does not exist!!! Creating...")
        group_id = self.get_group_template_id()
        template_id = self.create_template(group_id)
    return template_id
def parse_poa_pir(self, data, cluster=None):
    output = list(SeqIO.parse(data, 'fasta'))
    consensus = [x for x in output if 'CONSENS' in x.id]
    msa = dict([(x.id, str(x.seq)) for x in output if 'CONSENS' not in x.id])
    try:
        cluster.msa = []
        # check PIR contains same reads as cluster
        if len(set([c.id for c in cluster]).symmetric_difference(msa.keys())) > 0:
            raise IOError
        # assign MSA sequences to reads
        for read in cluster:
            read.alignment = str(msa[read.id])
            cluster.msa.append(read.alignment)
    except AttributeError as e:
        pass
    if len(consensus) > 1:
        try:
            log.warn('{} generated alignment for cluster {} returned {} consensus sequences - using only first sequence:'.format(
                self.name, cluster.id, len(consensus)))
        except AttributeError as e:
            log.warn('{} generated alignment returned {} consensus sequences - using only first sequence'.format(
                self.name, len(consensus)))
        log.debug('\n'.join([x.id for x in consensus]))
    if consensus:
        consensus = str(consensus[0].seq)
        try:
            cluster.consensus = str(consensus)
        except AttributeError:
            pass
    return consensus, msa
def get_proxy_id(self):
    """
    Method used to check if the proxy exists.

    :return: a control value and the proxy ID if exists
    """
    payload = {
        "jsonrpc": "2.0",
        "method": "proxy.get",
        "params": {
            "output": "extend"
        },
        "auth": self.api_auth,
        "id": 1
    }
    response = self.contact_zabbix_server(payload)
    proxy_id = None
    log.info("Getting Platform Proxy id ...")
    for item in response['result']:
        if item['host'] == self.zabbix_proxy_name:
            proxy_id = item['proxyid']
            break
    if not proxy_id:
        # Proxy does not exist yet, create one
        log.warn("Proxy id does not exist, creating ....")
        payload = {
            "jsonrpc": "2.0",
            "method": "proxy.create",
            "params": {
                "host": self.zabbix_proxy_name,
                "status": "5"
            },
            "auth": self.api_auth,
            "id": 1
        }
        response = self.contact_zabbix_server(payload)
        proxy_id = response['result']['proxyids'][0]
    log.info("Proxy id: %s" % proxy_id)
    return proxy_id
def load_connections(
    config: dict,
    session: Session = None,
):
    connections = config.get("connections", None)
    if connections is None:
        log.info("No connections found, skipping")
        return
    log.info("Loading connections from config...")
    for key in connections.keys():
        val: dict = connections.get(key)
        if not isinstance(val, dict):
            log.warn(f"Connection {key} skipped. Value must be a dictionary.")
            continue
        connection = session.query(Connection).filter_by(conn_id=key).first()
        if connection is not None:
            log.info(f"Connection exists, skipping: {key}")
            continue
        log.info("Setting connection: " + key)
        extra = val.get("extra", None)
        if extra is not None and not isinstance(extra, (int, str)):
            extra = json.dumps(extra)
        connection = Connection(
            conn_id=key,
            conn_type=val.get("conn_type", None),
            host=val.get("host", None),
            login=val.get("login", None),
            password=val.get("password", None),
            schema=val.get("schema", None),
            port=val.get("port", None),
            extra=extra,
        )
        session.add(connection)
        session.commit()
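# Hedged usage sketch, not part of the original module: a minimal config dict of
# the shape load_connections() above expects -- a "connections" mapping of
# conn_id -> connection fields. The ids, host and credentials are illustrative
# assumptions, as is the commented-out session; only the field names come from
# the code above.
example_config = {
    "connections": {
        "my_postgres": {
            "conn_type": "postgres",
            "host": "db.example.com",
            "login": "airflow",
            "password": "change-me",
            "schema": "analytics",
            "port": 5432,
            "extra": {"sslmode": "require"},  # non-(int, str) extras get JSON-encoded
        },
    },
}
# load_connections(example_config, session=some_sqlalchemy_session)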
def get_rates(product, start_dt=None, end_dt=None, sec_per_tick=5):
    """
    Returns the list of ticks with the schema
        [unix time, low, high, open, close, volume]
    for the specified DATETIME interval. A maximum of 200 ticks will be
    returned and missing ticks are possible.

    By default it returns the last 1000 seconds of historic data with
    sec_per_tick = 5.

    Also returns the full response object.
    """
    if end_dt is None:
        end_dt = datetime.utcnow()
    if start_dt is None:
        start_dt = end_dt - timedelta(seconds=max_ticks * sec_per_tick)
    params = {
        'start': start_dt.isoformat(),
        'end': end_dt.isoformat(),
        'granularity': sec_per_tick,
    }
    log.info('getting HISTORIC RATES')
    resp = httpapi.get(
        common.api_url + 'products/' + product + '/candles',
        params=params,
        auth=common.auth,
    )
    rates = resp.json()
    if resp.status_code == 200 and len(rates) > 0:
        fetched_start = datetime.utcfromtimestamp(rates[-1][0])
        fetched_end = datetime.utcfromtimestamp(rates[0][0])
        # TODO(richardwu): For some reason GDAX over-extends and returns
        # values < start_dt.
        # assert(fetched_start >= start_dt)
        if fetched_start < start_dt:
            log.warn('KNOWN BUG: HISTORIC RATES fetched_start < cur_start')
            log.warn('fetched_start: ' + fetched_start.isoformat())
            log.warn('start_dt: ' + start_dt.isoformat())
        assert (fetched_end <= end_dt)
    return rates, resp
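# Illustrative sketch, not part of the original module: shows how a caller is
# assumed to consume the candle schema documented in get_rates() above --
# [unix time, low, high, open, close, volume]. The sample values below are
# made up for demonstration only.
from datetime import datetime

_sample_rates = [
    [1483228800, 963.5, 966.3, 964.0, 966.0, 12.5],
    [1483228795, 962.9, 964.1, 963.2, 964.0, 3.1],
]
for _ts, _low, _high, _open, _close, _volume in _sample_rates:
    print('{}: low={} high={} open={} close={} volume={}'.format(
        datetime.utcfromtimestamp(_ts).isoformat(), _low, _high, _open, _close, _volume))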
def call_cluster(self, cluster, filter_function=None, result_filter=None, temp_file_path=None):
    import tempfile
    if len(cluster) == 1:
        log.warn('Cluster {} has single read, not calling'.format(cluster.id))
        try:
            cluster.consensus_seq = None
            cluster.consensus_builder = None
            cluster.set_call(None)
            cluster.candidates = None
            cluster.candidates_method = str(self)
        except AttributeError as e:
            pass
        finally:
            return None

    consensus_seq = None
    consensus_seq_id = None
    f = None
    is_cluster_inst = False  # flag for filling descriptive attributes

    if hasattr(cluster, '__getitem__'):  # assumed to be list of sequences, get consensus
        try:
            if temp_file_path:
                with open(temp_file_path, 'wb') as f:
                    f.write(fasta_from_seq(*zip(*[(x.id, x.seq) for x in cluster])))
            consensus_seq = self.consensus_builder.generate_consensus(
                temp_file_path if temp_file_path else cluster)
            if not consensus_seq:
                cluster.consensus = None
                cluster.candidates_method = str(self)
                return
            consensus_seq_id = 'cons'
            log.info('Generated consensus with:\n{}'.format(str(self.consensus_builder)))
            log.debug('Output:\n{}'.format(consensus_seq))
            try:
                cluster.consensus = consensus_seq
                cluster.consensus_method = str(self.consensus_builder)
            except AttributeError as e:
                pass
        except TypeError as e:
            ## No consensus builder is set
            raise ValueError(
                'Cluster calling: list of cluster sequences provided but no consensus builder instantiated.')
    else:
        if isinstance(cluster, basestring):  # input is path
            if os.path.exists(cluster):
                cons_path = cluster
            else:
                raise ValueError(
                    'Cluster calling input invalid. String provided but is not valid path. If trying to cast as Bio.Seq.Seq-like object')
        else:  # input is consensus seq
            consensus_seq = cluster.seq
            consensus_seq_id = cluster.id

    ## save blasr target in all cases except path as input
    if consensus_seq:
        try:
            f = open(temp_file_path, 'wb+') if temp_file_path else tempfile.NamedTemporaryFile(delete=False)
            f.write(str(fasta_from_seq(consensus_seq_id, consensus_seq)))
            cons_path = f.name
            f.close()
        except AttributeError as e:
            raise ValueError(
                'Cluster calling input invalid. Provide iterable of cluster sequences, path to cluster consensus or Bio.Seq.Seq-like object to call')

    ## run blasr mapping of consensus_seq against allele database
    command = [self.blasr.src, '', self.allele_db, cons_path]
    try:
        mapping_output = self.blasr.run(*command)
    except ValueError as e:
        log.warn('Blasr returned no mapping')
        try:
            cluster.set_call(None)
            cluster.candidates = None
            cluster.candidates_method = str(self)
        except AttributeError as e:
            pass
        finally:
            return None
    f.close()

    ## select from mapping the desired result as the call
    if not filter_function:
        filter_function = self.filter_function
    try:
        mapping_output = sorted(mapping_output, key=filter_function)
        cluster_call = mapping_output[0]
    except ValueError as e:
        log.error('Invalid blasr mapping value')
        log.debug('\n'.join([str(x) for x in mapping_output]))
        raise e
    if not result_filter:
        result_filter = self.result_filter
    result = result_filter(cluster_call)
    try:
        cluster.set_call([result])
        cluster.candidates = list(mapping_output)
        cluster.candidates_method = str(self)
    except AttributeError as e:
        return result
def Scan(config, ctr_dirs):
    """
    Query the LSF host for all jobs in /[controldir]/processing with ``bjobs``.
    If the job has stopped running, the exit code is read and the diagnostics
    and comments files are updated. Finally ``gm-kick`` is executed on all jobs
    with an exit code. If the exit code can not be read from the diagnostics
    file, it will (after 5 tries) be kicked with status UNKNOWN.

    :param str config: path to arc.conf
    :param ctr_dirs: list of paths to control directories
    :type ctr_dirs: :py:obj:`list` [ :py:obj:`str` ... ]
    """
    configure(config, set_lsf)
    if Config.scanscriptlog:
        scanlogfile = arc.common.LogFile(Config.scanscriptlog)
        arc.common.Logger_getRootLogger().addDestination(scanlogfile)
        arc.common.Logger_getRootLogger().setThreshold(Config.log_threshold)

    jobs = get_jobs(ctr_dirs)
    if not jobs:
        return
    if Config.remote_host:
        # NOTE: Assuming 256 B of TCP window needed for each job
        ssh_connect(Config.remote_host, Config.remote_user, Config.private_key, (2 << 7) * len(jobs))

    lsf_bin_path = Config.lsf_bin_path
    execute = execute_local if not Config.remote_host else execute_remote
    args = Config.lsf_setup + ' ' + lsf_bin_path + '/bjobs -w -W ' + ' '.join(jobs.keys())
    if os.environ.has_key('__LSF_TEST'):
        handle = execute(args, env=dict(os.environ))
    else:
        handle = execute(args)

    def handle_job(info, in_lsf=True):
        job = jobs[info[0]]
        job.state = info[2]
        if job.state in RUNNING:
            if os.path.exists(job.count_file):
                os.remove(job.count_file)
            return
        if set_exit_code_from_diag(job):
            if in_lsf:
                start, end = info[-2:]
                re_date = re.compile(r'^(?P<mm>\d\d)/(?P<dd>\d\d)-(?P<HH>\d\d):(?P<MM>\d\d)')
                job.LRMSStartTime = arc.common.Time(get_MDS(re_date.match(start).groupdict()))
                if end != '-':
                    job.LRMSEndTime = arc.common.Time(get_MDS(re_date.match(end).groupdict()))
                    job.WallTime = job.LRMSEndTime - job.LRMSStartTime
            # Job finished and exitcode found
            job.message = MESSAGES[job.state]
            return
        # else
        add_failure(job)

    # Handle jobs known to LSF
    for line in handle.stdout[1:]:
        try:
            info = line.strip().split()
            assert (len(info) == 15)
            handle_job(info)
        except Exception as e:
            if line:
                warn('Failed to parse bjobs line: %s\n%s' % (line, str(e)), 'lsf.Scan')

    # Handle jobs lost in LSF
    if handle.returncode != 0:
        debug('Got error code %i from bjobs' % handle.returncode, 'lsf.Scan')
        debug('Error output is:\n' + ''.join(handle.stderr), 'lsf.Scan')
        lost_job = re.compile('Job <(\d+)> is not found')
        for line in handle.stderr:
            match = lost_job.match(line)
            if match:
                handle_job([match.groups()[0], None, 'UNKNOWN'], False)

    kicklist = []
    for job in jobs.itervalues():
        if hasattr(job, 'exitcode'):
            with open(job.lrms_done_file, 'w') as f:
                f.write('%d %s\n' % (job.exitcode, job.message))
            write_comments(job)
            update_diag(job)
            kicklist.append(job)
    gm_kick(kicklist)
# Slurm can report StartTime and EndTime in at least these two formats:
# 2010-02-15T15:30:29 (MDS)
# 02/15-15:25:15
# Python does not support duplicate named groups.
# Have to use separate regex if we want to use named groups.
#date_MDS = re.compile(r'^(?P<YYYY>\d\d\d\d)-(?P<mm>\d\d)-(?P<dd>\d\d)T(?P<HH>\d\d):(?P<MM>\d\d):(?P<SS>\d\d)$')
#date_2 = re.compile(r'^(?P<mm>\d\d)/(?P<dd>\d\d)-(?P<HH>\d\d):(?P<MM>\d\d):(?P<SS>\d\d)$')
date_MDS = re.compile(r'^(?P<YYYY>\d\d\d\d)-(?P<mm>\d\d)-(?P<dd>\d\d) (?P<HH>\d\d):(?P<MM>\d\d):(?P<SS>\d\d)$')

for line in handle.stdout:
    try:
        localid, state = line.strip().split(':', 1)
    except:
        if line:
            warn('Failed to parse squeue line: ' + line, 'slurm.Scan')
        continue
    job = jobs[localid]
    job.state = state
    if job.state in ['PENDING', 'RUNNING', 'SUSPENDED', 'COMPLETING']:
        continue
    if not job.state:
        set_exit_code_from_diag(job)
    job.message = MESSAGES.get(job.state, '')

    args = Config.slurm_bin_path + '/scontrol -o show job %s' % localid
    scontrol_handle = execute(args)
    if scontrol_handle.returncode != 0:
        debug('Got error code %i from scontrol' % scontrol_handle.returncode, 'slurm.Scan')
        debug('Error output is:\n' + ''.join(scontrol_handle.stderr), 'slurm.Scan')
def cluster(self, reads, save_input_path=None, output_dir=None, cached_output=None):
    import shutil

    def get_seq_obj(output):
        seq_mapping = dict([(x.id, x) for x in reads])
        output_seqs = map(lambda x: [seq_mapping[y] for y in x], output)
        return output_seqs

    try:
        ## reads are a path
        self.reads = dict([(x.id, x) for x in SeqIO.parse(reads, 'fasta')])
    except AttributeError as e:
        ## reads is a list of SeqRecord-like objects
        self.reads = dict([(x.id, x) for x in reads])
    finally:
        if cached_output and self.distance_calculator.matrix:
            self.input_matrix, num_edges, mapping = self.convert_adjacency_matrix(
                self.distance_calculator.matrix)
            mapping = self.reverse_mappings(mapping, self.reads)
            return [
                Cluster(x, cluster_id=i, clustering_tool=self)
                for i, x in enumerate(
                    sorted(self.parse_dsf_output(cached_output, mapping),
                           key=lambda x: len(x), reverse=True))
            ]

    self.distance_calculator.generate_distances(reads)
    try:
        reads.close()
    except AttributeError as e:
        pass

    self.input_matrix, num_edges, mapping = self.convert_adjacency_matrix(
        self.distance_calculator.matrix)
    mapping = self.reverse_mappings(mapping, self.reads)
    log.debug('Number of edges in input graph:' + str(num_edges))

    # write adjacency to file for dsf input
    def writer(matrix, num_edges):
        yield '{} {} 001\n'.format(len(matrix), num_edges)
        for neighbours in matrix:
            line = ' '.join([
                ' '.join(map(str, (n + 1, w)))
                for n, w in sorted(neighbours, key=lambda x: x[0])
            ])
            yield '{}\n'.format(line)

    matrix_output_iterator = writer(self.input_matrix, num_edges)
    in_file = tempfile.NamedTemporaryFile(delete=True)
    try:
        if save_input_path:
            in_file = open(save_input_path, 'wb')
    except IOError as e:
        in_file = tempfile.NamedTemporaryFile(delete=True)
        log.warn('Provided DSF input matrix write path not valid, using temporary file')
    log.info('Saving dsf input file to {}'.format(in_file.name))
    in_file.writelines(matrix_output_iterator)
    in_file.flush()

    # check provided output_dir is valid
    if output_dir:
        if not os.path.exists(output_dir):
            log.warn('Provided DSF output directory path not valid, using temporary directory')
            output_dir = None
        else:
            temp_dir = None

    # make temp output dir if no valid output dir provided
    if not output_dir:
        temp_dir = tempfile.mkdtemp()
        output_dir = temp_dir
        saved_umask = os.umask(0077)  # Ensure the file is read/write by the creator only

    # run DSF
    try:
        output = self.run(self.src, self.params, in_file.name, output_dir, mapping)  # run dsf
    except Exception as e:
        # This is just so the temp files get deleted in the case some previous
        # unhandled exception gets raised
        raise e
    finally:
        if temp_dir:
            os.umask(saved_umask)
            shutil.rmtree(temp_dir)
        in_file.close()

    ## generate instances of cluster_class.Cluster as result
    output = [
        Cluster(x, cluster_id=i, clustering_tool=self)
        for i, x in enumerate(sorted(output, key=lambda x: len(x), reverse=True))
    ]
    return output
def generate_distances(self, reads=None, minimap=None, filter_func=lambda x: True):
    ## Generates distance matrix of form {read_id }
    ## reads = [Bio.SeqIO, ...] = list of ORIENTED (ie no rev-compl) reads to be clustered (ie containing genes)
    ##         if None uses self.reads
    ## minimap = instance of MinimapWrapper object. If none uses pre-set self.minimap
    ## filter_func = distance included in output if threshold(distance) = True
    import copy
    from Bio import SeqIO

    if self.filter_function:
        filter_func = self.filter_function
    if self.matrix:
        log.info('Using cached distance matrix')
        result = self.matrix
        if filter_func:
            result_filtered = self.filter_matrix(copy.deepcopy(self.matrix), filter_func)
            result = result_filtered
        return self.matrix
    if not minimap:
        minimap = self.minimap

    mapping = minimap.ava(reads=reads)
    result = {}
    mapped_reads = set()  # for keeping track of mapped reads to report missing reads
    for i, line in enumerate(mapping):
        mapped_reads.add(line.qName)
        mapped_reads.add(line.tName)
        try:
            if 'NM' not in line.NM:
                raise IndexError
            NM = int(line.NM.split(':')[2])
        except IndexError as e:
            log.error('Error in Minimap output: NM field is likely missing\nmapping line:{}'.format(
                '\t'.join(line)))
            log.debug(dir(line))
            log.debug(zip(line.header, line.attributes))
            raise ValueError()
        distance_value = (NM + line.qStart + (line.qLength - line.qEnd) +
                          NM + line.tStart + (line.tLength - line.tEnd)) / float(line.qLength + line.tLength)
        if line.qName not in result:
            result[line.qName] = {}
        result[line.qName][line.tName] = distance_value

    # check if any reads missing from mapping
    missing = set([x.id for x in reads]).difference(mapped_reads)
    if missing:
        log.warn('{} / {} reads missing from mapping'.format(len(missing), len(list(reads))))
        log.debug('\n'.join(list(missing)))

    self.matrix = result
    if filter_func:
        result_filtered = self.filter_matrix(copy.deepcopy(result), filter_func)
        result = result_filtered
    return result
def run(args):
    # add lineage profiles/stats

    import re
    from ete2 import PhyloTree, NCBITaxa

    # dump tree by default
    if not args.tree and not args.info and not args.descendants:
        args.tree = True

    ncbi = NCBITaxa()

    all_taxids = {}
    all_names = set()
    queries = []

    if not args.search:
        log.error('Search terms should be provided (i.e. --search) ')
        sys.exit(-1)
    for n in args.search:
        queries.append(n)
        try:
            all_taxids[int(n)] = None
        except ValueError:
            all_names.add(n.strip())

    # translate names
    name2tax = ncbi.get_name_translator(all_names)
    all_taxids.update([(v, None) for v in name2tax.values()])

    not_found_names = all_names - set(name2tax.keys())
    if args.fuzzy and not_found_names:
        log.warn("%s unknown names", len(not_found_names))
        for name in not_found_names:
            # enable extension loading
            tax, realname, sim = ncbi.get_fuzzy_name_translation(name, args.fuzzy)
            if tax:
                all_taxids[tax] = None
                name2tax[name] = tax
                name2realname[name] = realname
                name2score[name] = "Fuzzy:%0.2f" % sim

    if not_found_names:
        log.warn("[%s] could not be translated into taxids!" % ','.join(not_found_names))

    if args.tree:
        if len(all_taxids) == 1:
            target_taxid = all_taxids.keys()[0]
            log.info("Dumping NCBI descendants tree for %s" % (target_taxid))
            t = ncbi.get_descendant_taxa(target_taxid,
                                         collapse_subspecies=args.collapse_subspecies,
                                         rank_limit=args.rank_limit,
                                         return_tree=True)
        else:
            log.info("Dumping NCBI taxonomy of %d taxa..." % (len(all_taxids)))
            t = ncbi.get_topology(all_taxids.keys(),
                                  intermediate_nodes=args.full_lineage,
                                  rank_limit=args.rank_limit,
                                  collapse_subspecies=args.collapse_subspecies)

        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s - %s" % (id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_lineage(n.taxid)
            n.add_features(named_lineage='|'.join(ncbi.translate_to_names(lineage)))
        dump(t, features=["taxid", "name", "rank", "bgcolor", "sci_name",
                          "collapse_subspecies", "named_lineage"])
    elif args.descendants:
        log.info("Dumping NCBI taxonomy of %d taxa..." % (len(all_taxids)))
        print '# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "descendant_taxids", "descendant_names"])
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid in all_taxids:
            descendants = ncbi.get_descendant_taxa(taxid,
                                                   collapse_subspecies=args.collapse_subspecies,
                                                   rank_limit=args.rank_limit)
            print '\t'.join([str(taxid), translator.get(taxid, taxid), ranks.get(taxid, ''),
                             '|'.join(map(str, descendants)),
                             '|'.join(map(str, ncbi.translate_to_names(descendants)))])
    elif args.info:
        print '# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "Named Lineage", "Taxid Lineage"])
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid, name in translator.iteritems():
            lineage = ncbi.get_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage_string = ','.join(map(str, lineage))
            print '\t'.join([str(taxid), name, ranks.get(taxid, ''), named_lineage, lineage_string])
def store_rates(rates, product):
    """
    Rates must be a list of ticks with the schema
        [unix time, low, high, open, close, volume]
    """
    if len(rates) == 0:
        log.warn('no rates to store')
        return
    log.info('storing {} HISTORIC RATES'.format(len(rates)))
    log.info('first rate date: ' + datetime.utcfromtimestamp(rates[-1][0]).isoformat())
    log.info('last rate date: ' + datetime.utcfromtimestamp(rates[0][0]).isoformat())

    db_params = dict(
        dbname=dbconfig['db_name'],
        user=dbconfig['db_user'],
        host=dbconfig['host'],
        port=dbconfig['port'],
    )
    log.info('connecting to SQL DB')
    log.info('params:')
    log.info(db_params)
    conn = psycopg2.connect(**db_params)
    conn.set_session(autocommit=True)
    log.info('connected to DB.')
    cur = conn.cursor()

    log.info('creating database {} (if necessary).'.format(dbconfig['db_name']))
    cur.execute('CREATE DATABASE IF NOT EXISTS {}'.format(dbconfig['db_name']))

    n_cols = 7
    log.info('creating table {} (if necessary).'.format(_rates_tbl))
    cur.execute('''CREATE TABLE IF NOT EXISTS {} (
        product STRING,
        timestamp INT,
        low DECIMAL,
        high DECIMAL,
        open DECIMAL,
        close DECIMAL,
        volume DECIMAL,
        PRIMARY KEY (product, timestamp)
    )'''.format(_rates_tbl))

    # List of rate values
    rate_vals = [[product] + rate for rate in rates]
    assert (len(rate_vals[0]) % 7 == 0)

    # [:-1] is to remove the last comma
    sql_tuple_str = '(' + ('%s,' * n_cols)[:-1] + '),'
    sql_batch_str = (sql_tuple_str * _batch_sz)[:-1]

    log.info('inserting rates into DB...')
    while len(rate_vals) > 0:
        sql_vals_str = sql_batch_str
        if len(rate_vals) < _batch_sz:
            sql_vals_str = (sql_tuple_str * len(rate_vals))[:-1]
        batch = rate_vals[:_batch_sz]
        flatten_vals = [val for rate in batch for val in rate]
        cur.execute('UPSERT INTO {} VALUES '.format(_rates_tbl) + sql_vals_str, flatten_vals)
        log.info('upserted {} rates.'.format(len(batch)))
        rate_vals = rate_vals[_batch_sz:]
    log.info('inserting {} HISTORIC RATES complete.'.format(len(rates)))

    cur.close()
    conn.close()
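# Worked example, not part of the original module: what the placeholder strings
# built inside store_rates() above expand to. n_cols = 7 matches the table
# columns; the batch size of 2 is a made-up value for illustration only
# (the real code uses _batch_sz).
n_cols = 7
batch_sz = 2
sql_tuple_str = '(' + ('%s,' * n_cols)[:-1] + '),'
sql_batch_str = (sql_tuple_str * batch_sz)[:-1]
print(sql_tuple_str)   # (%s,%s,%s,%s,%s,%s,%s),
print(sql_batch_str)   # (%s,%s,%s,%s,%s,%s,%s),(%s,%s,%s,%s,%s,%s,%s)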
log.err("Output type (-ot) must be either 'forest', 'derivation', or 'derived'.") sys.exit(1) if not args.weight_type in ['prob', 'logprob']: log.err("Weight type (-m) must be either 'prob'or 'logprob'.") sys.exit(1) logprob = (args.weight_type == 'logprob') if args.output_type == "forest": if not args.output_file: log.err("Need to provide '-o FILE_PREFIX' with output type 'forest'.") sys.exit(1) if args.k: log.warn("Ignoring -k command line option because output type is 'forest'.") if not args.parser in ['td', 'basic']: log.err("Parser (-p) must be either 'td' or 'basic'.") sys.exit(1) if args.parser != 'td' and args.boundary_nodes: log.warn('The -bn option is only relevant for the tree decomposition parser ("-p td").') if args.k > config.maxk: log.err("k must be <= than %i (defined in in args.py)." % args.maxk) sys.exit(1) if args.verbose < 0 or args.verbose > 4: log.err("Invalid verbosity level, must be 0-4.") sys.exit(1)
def Scan(config, ctr_dirs):
    """
    Query the SLURM host for all jobs in /[controldir]/processing with ``squeue``.
    If the job has stopped running, more detailed information is fetched with
    ``scontrol``, and the diagnostics and comments files are updated.
    Finally ``gm-kick`` is executed on all jobs with an exit code.

    :param str config: path to arc.conf
    :param ctr_dirs: list of paths to control directories
    :type ctr_dirs: :py:obj:`list` [ :py:obj:`str` ... ]
    """
    configure(config, set_slurm)
    if Config.scanscriptlog:
        scanlogfile = arc.common.LogFile(Config.scanscriptlog)
        arc.common.Logger_getRootLogger().addDestination(scanlogfile)
        arc.common.Logger_getRootLogger().setThreshold(Config.log_threshold)

    jobs = get_jobs(ctr_dirs)
    if not jobs:
        return
    if Config.remote_host:
        # NOTE: Assuming 256 B of TCP window needed for each job (squeue)
        ssh_connect(Config.remote_host, Config.remote_user, Config.private_key, (2 << 7) * len(jobs))

    execute = execute_local if not Config.remote_host else execute_remote
    args = Config.slurm_bin_path + "/squeue -a -h -o %i:%T -t all -j " + ",".join(jobs.keys())
    if os.environ.has_key("__SLURM_TEST"):
        handle = execute(args, env=dict(os.environ))
    else:
        handle = execute(args)
    if handle.returncode != 0:
        debug("Got error code %i from squeue" % handle.returncode, "slurm.Scan")
        debug("Error output is:\n" + "".join(handle.stderr), "slurm.Scan")

    # Slurm can report StartTime and EndTime in at least these two formats:
    # 2010-02-15T15:30:29 (MDS)
    # 02/15-15:25:15
    # Python does not support duplicate named groups.
    # Have to use separate regex if we want to use named groups.
    date_MDS = re.compile(r"^(?P<YYYY>\d\d\d\d)-(?P<mm>\d\d)-(?P<dd>\d\d)T(?P<HH>\d\d):(?P<MM>\d\d):(?P<SS>\d\d)$")
    date_2 = re.compile(r"^(?P<mm>\d\d)/(?P<dd>\d\d)-(?P<HH>\d\d):(?P<MM>\d\d):(?P<SS>\d\d)$")

    for line in handle.stdout:
        try:
            localid, state = line.strip().split(":", 1)
        except:
            if line:
                warn("Failed to parse squeue line: " + line, "slurm.Scan")
            continue
        job = jobs[localid]
        job.state = state
        if job.state in RUNNING:
            continue
        if not job.state:
            set_exit_code_from_diag(job)
        job.message = MESSAGES.get(job.state, "")

        args = Config.slurm_bin_path + "/scontrol -o show job %s" % localid
        scontrol_handle = execute(args)
        if scontrol_handle.returncode != 0:
            debug("Got error code %i from scontrol" % scontrol_handle.returncode, "slurm.Scan")
            debug("Error output is:\n" + "".join(scontrol_handle.stderr), "slurm.Scan")
        try:
            scontrol_dict = dict(item.split("=", 1)
                                 for item in re.split(" (?=[^ =]+=)", scontrol_handle.stdout[0]))
            job = jobs[scontrol_dict["JobId"]]
        except:
            warn("Failed to parse scontrol line: " + line, "slurm.Scan")
            continue

        if "ExitCode" in scontrol_dict:
            ec1, ec2 = scontrol_dict["ExitCode"].split(":")
            job.exitcode = int(ec2) + 256 if int(ec2) != 0 else int(ec1)
        else:
            job.exitcode = 0 if state == "COMPLETED" else -1
        if (state == "NODE_FAIL" or state == "CANCELLED") and \
                ("ExitCode" not in scontrol_dict or job.exitcode == 0):
            job.exitcode = 15
            job.message = "Job was cancelled by SLURM"

        if "StartTime" in scontrol_dict:
            match = date_MDS.match(scontrol_dict["StartTime"]) or date_2.match(scontrol_dict["StartTime"])
            scontrol_dict["StartTime"] = get_MDS(match.groupdict())
            job.LRMSStartTime = arc.common.Time(scontrol_dict["StartTime"])
        if "EndTime" in scontrol_dict:
            match = date_MDS.match(scontrol_dict["EndTime"]) or date_2.match(scontrol_dict["EndTime"])
            scontrol_dict["EndTime"] = get_MDS(match.groupdict())
            job.LRMSEndTime = arc.common.Time(scontrol_dict["EndTime"])
        if "StartTime" in scontrol_dict and "EndTime" in scontrol_dict:
            job.WallTime = job.LRMSEndTime - job.LRMSStartTime
        if "NumCPUs" in scontrol_dict:
            job.Processors = scontrol_dict["NumCPUs"]

        with open(job.lrms_done_file, "w") as f:
            f.write("%d %s\n" % (job.exitcode, job.message))
        write_comments(job)
        update_diag(job)

    kicklist = [job for job in jobs.itervalues() if job.state not in RUNNING]
    kicklist.extend([job for job in jobs.itervalues() if job.state == "CANCELLED"])  # kick twice
    gm_kick(kicklist)
def run(args):
    # add lineage profiles/stats

    import re
    from ete2 import PhyloTree, NCBITaxa

    if not args.taxonomy and not args.info:
        args.taxonomy = True

    ncbi = NCBITaxa()

    all_taxids = {}
    all_names = set()
    queries = []

    if not args.search:
        log.error('Search terms should be provided (i.e. --search) ')
        sys.exit(-1)
    for n in args.search:
        queries.append(n)
        try:
            all_taxids[int(n)] = None
        except ValueError:
            all_names.add(n.strip())

    # translate names
    name2tax = ncbi.get_name_translator(all_names)
    all_taxids.update([(v, None) for v in name2tax.values()])

    not_found_names = all_names - set(name2tax.keys())
    if args.fuzzy and not_found_names:
        log.warn("%s unknown names", len(not_found_names))
        for name in not_found_names:
            # enable extension loading
            tax, realname, sim = ncbi.get_fuzzy_name_translation(name, args.fuzzy)
            if tax:
                all_taxids[tax] = None
                name2tax[name] = tax
                name2realname[name] = realname
                name2score[name] = "Fuzzy:%0.2f" % sim

    if args.taxonomy:
        log.info("Dumping NCBI taxonomy of %d taxa..." % (len(all_taxids)))
        t = ncbi.get_topology(all_taxids.keys(),
                              intermediate_nodes=args.full_lineage,
                              rank_limit=args.rank_limit,
                              collapse_subspecies=args.collapse_subspecies)
        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s - %s" % (id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_lineage(n.taxid)
            n.add_features(named_lineage='|'.join(ncbi.translate_to_names(lineage)))
        dump(t, features=["taxid", "name", "rank", "bgcolor", "sci_name",
                          "collapse_subspecies", "named_lineage"])
    elif args.info:
        print '# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "Named Lineage", "Taxid Lineage"])
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid, name in translator.iteritems():
            lineage = ncbi.get_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage_string = ','.join(map(str, lineage))
            print '\t'.join([str(taxid), name, ranks.get(taxid, ''),
                             named_lineage, lineage_string])
def get_data(force, gene, pseudogene, reverse_complement, parser, fix_karolinska,
             genome_range, gene_ids, coordinate, patch, post_process,
             functional_exceptions, unique_regions, max_cn, custom_url=None):
    def sf(x):
        y = re.split(r'(\d+)', x[len(gene):])
        return int(y[1]), y[2]

    # Get Karolinska's data
    cypdata = karolinska.get_karolinska_database(gene, parser, force, custom_url)
    if fix_karolinska is not None:
        fix_karolinska(cypdata)
    # pprint(cypdata)

    # Get NCBI data for genes and reference genome
    genes, hg19 = ncbi.get_genomes(gene_ids[0], genome_range, gene_ids[1:],
                                   force=force, reverse_complement=reverse_complement)
    new_seq = genes[gene].seq.tomutable()
    for c, n in patch:
        new_seq[coordinate(c, genes[gene])] = n
    genes[gene] = genes[gene]._replace(seq=new_seq.toseq())

    # Fix Karolinska's coordinates
    result = merger.merge(cypdata, genes[gene], coordinate, functional_exceptions, reverse_complement)
    ## pprint(genes['CYP21'].translation)
    ## pprint(genes['CYP21P'].translation)

    mx = collections.defaultdict(lambda: ['', []])
    for a in result:
        for m in result[a]['mutations']:
            mx[(m['pos'], m['op'])][0] = m
            mx[(m['pos'], m['op'])][1].append(a)
    for m in genes[gene].pseudo_mutations.values():
        m['functional'] = merger.is_functional(
            genes[gene], m, genes[gene].pseudo_mutations.values(), True)
        # if (m['pos'], m['op']) in mx:
        #     log.warn('[{}] {} (from {}) originates from {}',
        #              ' F'[mx[(m['pos'], m['op'])][0]['functional']],
        #              mx[(m['pos'], m['op'])][0]['old'],
        #              ','.join(set(mx[(m['pos'], m['op'])][1])),
        #              m['old']
        #     )

    # Remove mutations not present in hg19 and fix the coordinates
    for a in result:
        for m in result[a]['mutations']:
            if m['pos'] == 'pseudogene':
                continue
            if m['pos'] not in genes[gene].translation:
                log.warn('Main: Translation not found for {}: {} ({})', a, m['old'], m['pos'])
                m['pos'] = None
            else:
                m['pos'] = genes[gene].translation[m['pos']]
        result[a]['mutations'] = [m for m in result[a]['mutations'] if not m['pos'] is None]

    # Fetch missing dbSNP links
    result = dbsnp.get_dbsnp(result, genome_range, force)

    # Fix exon and intron coordinates
    for _, g in genes.iteritems():
        g.exons[:] = map(
            lambda x: (g.translation[int(x.start)], g.translation[int(x.end)]), g.exons)
        g.introns[:] = map(
            lambda x: (g.translation[int(x.start)], g.translation[int(x.end)]), g.introns)

    # Patch hg19 with reference SNPs
    hg19 = list(hg19)
    for gi, hi in genes[gene].translation.iteritems():
        if hg19[hi - genome_range[1]] != genes[gene].seq[gi]:
            hg19[hi - genome_range[1]] = genes[gene].seq[gi]
    hg19 = ''.join(hg19)

    result.update({
        gene + '*1': {
            'mutations': [],
            'phenotype': {
                'invivo': 'Normal',
                'invitro': 'Normal'
            }
        }
    })

    # Add missing regions
    post_process(genes, result)

    hoi = collections.OrderedDict()
    for pos, m in genes[gene].pseudo_translation.iteritems():
        hoi[genes[gene].translation[pos]] = NoIndent(
            (genes[pseudogene].translation[m['old_pos']], m['op'] if 'op' in m else ''))

    return dict(
        # map=hoi,
        seq=hg19,
        region=NoIndent(genome_range),
        name=gene,
        exons={
            '{}'.format(ei + 1): NoIndent(e)
            for ei, e in enumerate(genes[gene].exons)
        },
        special_regions={
            g: NoIndent(gg)
            for g, gg in genes[gene].special_regions.iteritems()
        },
        pseudogenes={
            g: {
                'exons': {
                    '{}'.format(ei + 1): NoIndent(e)
                    for ei, e in enumerate(genes[g].exons)
                },
                'special_regions': {
                    g: NoIndent(gg)
                    for g, gg in genes[g].special_regions.iteritems()
                }
            }
            for g in [pseudogene]
        } if pseudogene is not None else {},
        # Regions used for CNV detection of each gene
        unique_regions=NoIndent(unique_regions),
        # Unique CYP2D8 region used for CNV detection
        # Based on BLAT, that is [5e-4i-4e]
        cnv_region=NoIndent(('chr22', 42547463, 42548249)),
        alleles=OrderedDict([
            (a, {
                'phenotype': NoIndent(result[a]['phenotype']),
                'mutations': [
                    NoIndent(OrderedDict([(x, y[x]) for x in sorted(y, reverse=True)]))
                    for y in result[a]['mutations']
                ]
            })
            for a in sorted(result, key=sf)
        ]),
        max_cn=max_cn)
        allele_db='../database/V-QUEST-reference-allele-db+no-period-references.clustalw.no-gaps.fasta',
        num_mappings_to_save=5,
        skip_extraction=False):
    mapper.params = '-cx map-pb -k10 -w3 -N{}'.format(num_mappings_to_save - 1)
    try:
        mappings = mapper.run(reads_path, allele_db)
        reads = dict([(x.id, x) for x in SeqIO.parse(reads_path, 'fasta')])
    except (IOError, ValueError):
        log.error('Reads file does not exist or is invalid')
        raise ValueError

    unmapped = set(reads.keys()).difference([x.qName for x in mappings])
    if len(unmapped) > 0:
        log.warn('{} reads had no allele mapping, they will be removed\n'.format(len(unmapped)))
        log.debug('Read ids to be removed:\n' + '\n'.join(unmapped))
        reads = dict([(k, v) for k, v in reads.iteritems() if k not in unmapped])
    log.info('Loaded {} reads'.format(len(reads)))

    if not skip_extraction:
        reads, mappings = get_read_segments(reads, mappings)

    modified_seqs = []
    for m in mappings:
        try:
            reads[m.qName].mapping.append(m)
        except AttributeError:
            reads[m.qName].mapping = [m]