def main(locusID, f): POG_name, POG_link = get_POG_link(locusID) POGs = get_POGs( POG_link ) for pog in POGs: url = baseurl + 'getAnnotation.do?locusID=' + pog for x in TableParse.parse( urllib2.urlopen( url ).read() ): if len(x) == 0: continue if x[0] == 'Strain': strain = x[1][ : x[1].find('(')].strip().split() key = (strain[1],strain[2]) if key in tax_to_acc: acc = tax_to_acc[key] else: acc = raw_input("Give the accession for {0}:".format(key)) tax_to_acc[key] = acc print >> sys.stderr, "you typed:", acc elif x[0] == 'Genomic location': m = genomic_location_rex.match( x[1].replace(' ','') ) start,end,strand = int(m.group(1)), int(m.group(2)), m.group(3) strand = '+' if strand.find('+') >= 0 else '-' break strain = 'P.' + strain[1] + strain[2] locs,seq = getUpstream500_from_intergenic(strain+'.intergenic', pog, strand, start, end) if seq is None: print >> sys.stderr, "something wacky happened to seq retrieving for {0}({1}) from file {2}!!!".format(\ pog, strand, strain+'.intergenic') else: f.write(">" + strain + '_' + acc) f.write("_" + ",".join(locs)) f.write("_" + POG_name + "_" + pog + "\n") f.write("{0}\n".format(seq)) f.flush()
def get_POGs(link): POGs = [] url = baseurl + link x = urllib2.urlopen( url ).read() for p in TableParse.parse( x ): if len(p) > 1 and p[0] == '-->': POGs.append( p[1] ) return POGs
def parseOriginalStyle(): cells = TableParse.parse(result) log.debug(pprint.pformat(list(enumerate(cells)))) tableNum = 35 scheduled = todayDateTime((cells[tableNum][2] or cells[tableNum][3]).splitlines()[0]) actualCell = (cells[tableNum][4] or cells[tableNum][5]) if actualCell: actual = todayDateTime(actualCell.splitlines()[0].replace('(','').replace(')','')) isEstimate = 'estimated' in actualCell else: actual = None isEstimate = False note = cells[tableNum][6] return scheduled, actual, isEstimate, note
def make_ID_for_UPstream500Seq(locusID, tax_to_acc): url = baseurl + 'getAnnotation.do?locusID=' + locusID for x in TableParse.parse( urllib2.urlopen( url ).read() ): if len(x) > 1: if x[0]=='Strain': strain = x[1] if strain in tax_to_acc: acc = tax_to_acc[strain] else: acc = raw_input("Give the accession for {0}:".format(strain)) tax_to_acc[strain] = acc print >> sys.stderr, "you typed:", acc elif x[0]=='Genomic location': m = genomic_location_rex.match( x[1].replace(' ','') ) start,end,strand = int(m.group(1)), int(m.group(2)), m.group(3) print >> sys.stderr, "strand is ", strand if strand.find('+') >= 0: loc = "{0}/{1}-{2}".format(acc, start-500, start-1) else: loc = "{0}/{1}-{2}".format(acc, end+499, end+1) return loc
def get_Upstream500Seq(locusID): url = baseurl + 'getAnnotation.do?locusID=' + locusID for x in TableParse.parse( urllib2.urlopen( url ).read() ): if len(x) > 1 and x[0]=='Upstream 500 BP Region': return x[1][ : x[1].find('\n')].replace(' ','')
def html2lol(html): import TableParse lol = TableParse.parse(html) if verbose: print "Parsed return", str(type(lol)), 'of length', len(lol) return lol
def main(opt): """ Perform a bunch of queries and processing to build something like the SOT MP schedule page, only with automated info about which loads ran. """ outdir = opt.outdir if not os.path.exists(outdir): os.makedirs(outdir) nav = web_nav if opt.fileurls: nav = file_nav # fetch all of the loads that ran loads = sqlaca.fetchall("""select * from planned_run_loads order by datestart""") # get all of the short term schedules from MP short_term_top = glob('%s/cycle*/????????.html' % mp_sched_path) short_terms = [] for st_path in short_term_top: st_match = re.search('cycle(\d+)\/(\w{3}\d{4}\w).html', st_path) short_terms.append((int(st_match.group(1)), st_match.group(2))) short_terms = np.rec.fromrecords(short_terms, names=['cycle', 'label']) # fetch all mp comments from their schedule pages schedule_files = [] for cycle in range(3, max(short_terms['cycle'])): schedule_files.append('%s/schedules_ao%s.html' % ('/proj/web-icxc/htdocs/mp/html', cycle)) schedule_files.append('/proj/web-icxc/htdocs/mp/html/schedules.html') comments = [] for sched in schedule_files: sot_page = open(sched).read() table = TableParse.parse(sot_page) table = [x for x in table if len(x) > 0] table = [x for x in table if table[1] != ''] last_sched = '' for line in table: if line[0] == '': line[0] = last_sched if line[0] != last_sched: last_sched = line[0] sot_table = np.rec.fromrecords(table[1:], names=table[0]) for comment_week in sot_table[sot_table['Comment'] != '']: comments.append([comment_week['Week'], comment_week['Version'], comment_week['Comment']]) mp_comments = np.rec.fromrecords(comments, names=['week', 'version', 'comment']) # everything that was planned planning = sqlaca.fetchall("""select * from tl_processing where processing_tstart > '2002:007:13:35:00.000' order by sumfile_modtime""") sched_keys = ['sumfile_modtime', 'dir', 'doprint', 'sosa', 'color', 'runstopcolor', 'label', 'name', 'version', 'sortday', 'cycle', 'st_link', 'planned_start', 'planned_stop', 'actual_cmd_start', 'actual_cmd_stop', 'comment', 'mp_comment'] def_sched = dict( sumfile_modtime=None, dir=None, doprint=None, sosa=None, color='grey', runstopcolor='black', label=None, name=None, version=None, sortday=None, cycle=None, st_link=None, planned_start=None, planned_stop=None, actual_cmd_start=None, actual_cmd_stop=None, comment=' ', mp_comment=' ') # for each planned week, figure out if it ran or not, and either way, # push a dictionary for it to the master list schedule = [] for week in planning: sched = def_sched.copy() comments = [] sched['sumfile_modtime'] = week['sumfile_modtime'] sched['dir'] = week['dir'] sched['planned_start'] = week['planning_tstart'] if sched['planned_start'] > '2011:335': sched['sosa'] = 1 sched['planned_stop'] = week['planning_tstop'] if week['replan'] == 1: comments.append('replan/re-open') labelmatch = re.search('\/\d{4}\/(\w{3}\d{4})\/ofls(\w?)\/', week['dir']) if not labelmatch: raise ValueError("could not parse %s" % week['dir']) sched['label'] = "%s%s" % (labelmatch.group(1), labelmatch.group(2).upper()) sched['version'] = labelmatch.group(2).upper() sched['name'] = labelmatch.group(1) sched_time = time.strptime(sched['name'], '%b%d%y') sched['sortday'] = time.strftime('%Y%j', sched_time) mp_comment_match = mp_comments[ (mp_comments['week'] == sched['name']) & (mp_comments['version'] == sched['version'])] if len(mp_comment_match): sched['mp_comment'] = mp_comment_match[0]['comment'] if sched['label'] in short_terms['label']: st = short_terms[short_terms['label'] == sched['label']][0] sched['cycle'] = int(st['cycle']) cycle_path = os.path.join(mp_sched_path, 'cycle%d' % sched['cycle'], '%s.html' % sched['label']) cycle_url = (nav['mp_sched_url'] + "/cycle%d/" % sched['cycle'] + "%s.html" % sched['label']) if os.path.exists(cycle_path): sched['st_link'] = cycle_url else: for x in cycle_table: if ((week['processing_tstart'] > cycle_table[x][0]) and (week['processing_tstart'] < cycle_table[x][1])): sched['cycle'] = int(x) break # if the week flew if week['dir'] in loads['dir']: match_loads = loads[loads['dir'] == week['dir']] match_loads = np.sort(match_loads, order='datestart') sched['actual_cmd_start'] = min(match_loads['datestart']) sched['actual_cmd_stop'] = max(match_loads['datestop']) sched['color'] = 'black' load = match_loads[0] all_week_loads = sqlaca.fetchall( """select * from tl_built_loads where file = '%s' and sumfile_modtime = %f order by load_segment""" % (load['file'], load['sumfile_modtime'])) # does the run stop time match the plan? last_run_cmd_time = max(match_loads['datestop']) last_planned_cmd_time = max(all_week_loads['last_cmd_time']) observing_int_date = None vehicle_int_date = None if load['datestart'] > '2011:335': science_loads = match_loads[[match_loads['load_scs'] > 130]] if len(science_loads): science_cmd_stop = max(science_loads['datestop']) if science_cmd_stop < last_run_cmd_time: observing_int_date = science_cmd_stop if not last_run_cmd_time == last_planned_cmd_time: vehicle_int_date = last_run_cmd_time if (observing_int_date and observing_int_date != vehicle_int_date): comments.append('observing-only int. at %s' % observing_int_date) sched['runstopcolor'] = 'darkgreen' if vehicle_int_date: comments.append('full int. at %s' % vehicle_int_date) sched['runstopcolor'] = 'darkred' if len(comments): sched['comment'] = ', '.join(comments) schedule.append([sched[x] for x in sched_keys]) # make records schedule = np.rec.fromrecords(schedule, names=sched_keys) # hack to mark rows to print week name only once per week schedule = np.sort(schedule, order='sortday') schedule['doprint'][schedule['name'][1:] != schedule['name'][0:-1]] = True schedule['doprint'][-1] = True # sort reverse by day schedule = schedule[::-1] # make html TASK_TEMPLATES = os.path.join(os.environ['SKA'], 'share', 'schedule_view', 'templates') jinja_env = jinja2.Environment( loader=jinja2.FileSystemLoader(TASK_TEMPLATES)) # loop to make cycle-specific pages cycle_labels = [] template = jinja_env.get_template('schedule.html') for cycle in np.unique(schedule['cycle']): page = template.render(nav=nav, schedule=schedule[schedule['cycle'] == cycle]) f = open(os.path.join(outdir, 'schedule_%s.html' % cycle), 'w') f.write(page) f.close() # some ugliness to get the 8 characters (YYYY:DOY) of the min and max dstart = min(schedule[schedule['cycle'] == cycle]['planned_start']) dstop = max(schedule[schedule['cycle'] == cycle]['planned_stop']) daymin = dstart[0:8] daymax = dstop[0:8] cycle_labels.append(dict(cycle=cycle, start=daymin, stop=daymax, file='schedule_%s.html' % cycle)) # then make one big page page = template.render(nav=nav, schedule=schedule) f = open(os.path.join(outdir, 'schedules_all.html' % cycle), 'w') f.write(page) f.close() # and make the most recent one again as a top page template = jinja_env.get_template('master_schedule.html') maxcycle = np.max(schedule['cycle']) page = template.render(nav=nav, schedule=schedule[schedule['cycle'] == maxcycle], cycles=cycle_labels) f = open(os.path.join(outdir, 'schedule.html'), 'w') f.write(page) f.close()