def __init__(self, url_list, brand_names=[]):
    for i in url_list:
        self.urls.append(check_url(i))
    for i in brand_names:
        self.brand_names.append(i)
    user_agent = 'Mozilla/6.0 (compatible; MSIE 5.5; Windows NT)'
    self.headers = {'User-Agent': user_agent}
    self.log = get_file_logger('ju_report')
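# Every snippet here relies on a shared get_file_logger helper that is not
# shown. The keyword names (name, filestem) match the calls in these snippets,
# but the body below is only a minimal sketch of what such a helper might look
# like, assuming it attaches a logging.FileHandler to a named logger:
import logging

def get_file_logger(name, filestem=None):
    # hypothetical implementation, not the project's actual code
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)
    if not logger.handlers:  # avoid stacking handlers on repeated calls
        handler = logging.FileHandler(filestem or ('%s.log' % name))
        handler.setFormatter(logging.Formatter(
            '%(asctime)s %(name)s %(levelname)s: %(message)s'))
        logger.addHandler(handler)
    return logger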
def main():
    # assumes module-level imports (re, string, time, torndb) and the
    # constants MYSQL_USER, MYSQL_PASSWORD and INPUT_FILE_PATH
    log = get_file_logger('ju_item_insert')
    hostaddress = 'localhost'
    database = 'ju_db'
    user = MYSQL_USER
    password = MYSQL_PASSWORD
    db = torndb.Connection(hostaddress, database, user, password)

    excel_handler = DealExcel(INPUT_FILE_PATH, 'Sheet1')
    ju_brands = excel_handler.read_column_excel(1, 2)
    ju_urls = excel_handler.read_column_excel(2, 2)

    ju_pages = GetPageData(ju_urls, ju_brands)
    result = []
    for i, j in zip(ju_urls, ju_brands):
        result.append(ju_pages.get_page(i, j))

    # extract the floors from each fetched page
    floors = []
    for index, item in enumerate(result):
        floors.append(GetJuFloor(item['data'], item['title']).get_floors())

    values = []
    time_start = time.strftime('%Y-%m-%d-%H:%M:%S', time.localtime(time.time()))
    for item in floors:
        row_small_item = []
        row_big_item = GetJuItem(item.get('big'), item['brand_name']).get_big_items()
        values.extend(row_big_item)
        small_pages = GetPageData(item['small'].get('urls'),
                                  item['brand_name']).get_pages()
        for i in small_pages:
            row_small_item.extend(
                GetJuItem(i, item['brand_name']).get_small_items())
        values.extend(row_small_item)

    sql_item = ("INSERT INTO ju_brand_item (id, name, description, created, price, "
                "orig_price, started, item_type, brand_name) "
                "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
    sql_num = "INSERT INTO ju_brand_item_num (item_id, sold_num) VALUES (%s, %s)"
    sql_item_detail = ("INSERT INTO ju_brand_item_detail (item_id, img_src, detail_src) "
                       "VALUES (%s, %s, %s)")

    db_item = []
    db_item_num = []
    db_item_detail = []
    for value in values:
        try:
            # u'\u4eba\u5df2\u4e70' is the "people have bought" marker
            is_started = value['str_people'] == u'\u4eba\u5df2\u4e70\n'
            item_id = string.atoi(re.findall(r'\d+', value['src_detail'])[0])
            db_item.append([item_id, value['name'], value['desc'], value['date_time'],
                            value['price'], value['orig_price'], is_started,
                            value['item_type'], value['brand_name']])
            db_item_num.append([item_id, value['sold_num']])
            db_item_detail.append([item_id, value['img_src'], value['src_detail']])
        except Exception:
            log.error(str(value['name']))
            continue
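# The excerpt above builds the SQL statements and the db_item* row lists but
# ends before the rows are written to MySQL. A hedged sketch of the step that
# presumably follows, using torndb's executemany; the error handling and
# close() placement are assumptions, not the original script's code:
try:
    db.executemany(sql_item, db_item)
    db.executemany(sql_num, db_item_num)
    db.executemany(sql_item_detail, db_item_detail)
except Exception as e:
    log.error("bulk insert failed: %s", e)
finally:
    db.close()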
def set_logger(self):
    created_dir = False
    if not os.path.isdir(self.log_dir):
        try:
            os.makedirs(self.log_dir)
            created_dir = True
        except Exception:
            raise AttributeError(
                "Failed to create log output directory %s" % self.log_dir)
    self.logger = get_file_logger(
        name=self.title,
        filestem=os.path.join(self.log_dir,
                              '%s.%s.log' % (self.title, self.now_str)))
    if created_dir:
        self.logger.info("Created log output directory %s.", self.log_dir)
class GetPageData(object):
    """Fetch the page behind each URL and keep the text as unicode."""

    log = get_file_logger('get_page')

    def __init__(self, url_list, brand_names=[]):
        # instance-level lists: class-level lists would be shared (and keep
        # growing) across every GetPageData instance
        self.urls = [check_url(i) for i in url_list]
        self.brand_names = list(brand_names)
        user_agent = 'Mozilla/6.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': user_agent}

    def get_page(self, url='', page_title=u'', decode_str='gbk'):
        """Return the decoded page data and its title packed in a dict."""
        try:
            page = ''
            if not url.strip():
                raise Exception('url is None')
            req = urllib2.Request(url, headers=self.headers)
            response = urllib2.urlopen(req)
            html = response.read()
            page = html.decode(decode_str)
        except Exception as e:
            self.log.error(e)
            return None
        return {
            'data': page,
            'title': page_title,
        }
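# A minimal usage sketch for the class above; the URLs and titles are
# placeholders, not real endpoints from the project:
pages = GetPageData(['http://example.com/brand-a', 'http://example.com/brand-b'],
                    ['brand-a', 'brand-b'])
page = pages.get_page('http://example.com/brand-a', u'brand-a')
if page is not None:
    print page['title'], len(page['data'])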
def __init__(self, page, brand_name):
    self.unicode_page = check_page(page)
    self.log = get_file_logger('ju_report')
    self.brand_name = brand_name
def __init__(self, file_name, sheet_name):
    self.file_name = file_name
    self.sheet_name = sheet_name
    self.log = get_file_logger('excel_handler')
def set_logger(self):
    self.logger = get_file_logger(
        name=self.title,
        filestem=os.path.join(self.log_dir,
                              '%s.%s.log' % (self.title, self.now_str)))
def __init__(self, floor_data, brand_name):
    self.log = get_file_logger('ju_report')
    self.floor_data = floor_data
    self.brand_name = brand_name
def get_methylation_gene_expression_data(case_ids):
    outdir = unique_output_dir("gdc-nih_gene_expr", reuse_empty=True)
    logger = get_file_logger("nih_methylation_gene_counts",
                             os.path.join(outdir, "getter.log"))

    # get relevant files for download
    qry_case = {
        "op": "in",
        "content": {
            "field": "cases.case_id",
            "value": list(case_ids)
        }
    }
    qry = {
        "filters": and_query(qry_primary, qry_case, or_query(qry_trans, qry_meth450)),
        "format": "json",
        "fields": ','.join(FILE_FIELDS),
        "size": 10000
    }
    response = requests.post(FILES_ENDPOINT, json=qry)
    if response.status_code != 200:
        logger.error("Initial query failed: %s", response.content)
        raise ValueError("Query failed")

    res = response.json()['data']['hits']
    meta = {}
    num_error = 0
    num_files = 0
    for r in res:
        if len(r['cases']) > 1:
            logger.error("File with ID %s has multiple case ID matches", r['file_id'])
        cid = r['cases'][0]['case_id']
        fid = r['file_id']
        fname = r['file_name']
        meta.setdefault(cid, {})
        if r['data_type'] == 'Gene Expression Quantification':
            if 'FPKM-UQ' in fname:
                continue
            elif 'FPKM' in fname:
                meta[cid]['fpkm'] = r
                outfn = os.path.join(outdir, cid, 'fpkm.gz')
            elif 'htseq.counts' in fname:
                meta[cid]['counts'] = r
                outfn = os.path.join(outdir, cid, 'counts.gz')
            else:
                # unrecognised expression file: skip it, otherwise outfn would
                # be undefined (or stale from a previous iteration)
                continue
        elif r['data_type'] == 'Methylation Beta Value':
            meta[cid]['methylation'] = r
            outfn = os.path.join(outdir, cid, 'methylation.txt')
        else:
            continue
        try:
            download_data(fid, outfn)
        except Exception:
            logger.exception("Failed to download %s for case id %s", fname, cid)
            num_error += 1
        else:
            logger.info("Downloaded case ID %s file ID %s to %s", cid, fid, outfn)
            num_files += 1
    logger.info("Downloaded %d files. Encountered %d errors.", num_files, num_error)

    # run back through and write one meta file per case
    num_meta = 0
    num_meta_errors = 0
    for cid, d in meta.iteritems():
        meta_fn = os.path.join(outdir, cid, 'meta.json')
        if os.path.exists(meta_fn):
            logger.error("Meta file already exists: %s", meta_fn)
            num_meta_errors += 1
        else:
            with open(meta_fn, 'wb') as f:
                json.dump(d, f)
            num_meta += 1
    logger.info("Created %d meta files. Encountered %d errors.", num_meta, num_meta_errors)
def get_legacy_idat(case_ids):
    outdir = unique_output_dir("gdc-nih_methylation", reuse_empty=True)
    logger = get_file_logger("legacy_idats", os.path.join(outdir, "getter.log"))

    qry_case = in_query("cases.case_id", case_ids)
    qry_idat = equal_query("files.data_format", "idat")
    qry = {
        "filters": and_query(qry_primary, qry_case, qry_idat, qry_meth450),
        "format": "json",
        "fields": ','.join(FILE_FIELDS),
        "size": 10000
    }
    response = requests.post(LEGACY_FILES_ENDPOINT, json=qry)
    if response.status_code != 200:
        logger.error("Initial query failed: %s", response.content)
        raise ValueError("Query failed")

    res = response.json()['data']['hits']
    logger.info("Found %d idat files.", len(res))

    num_error = 0
    num_files = 0
    # keep track of the downloaded files so the meta files can be written correctly
    meta = {}
    for r in res:
        if len(r['cases']) > 1:
            logger.error("File with ID %s has multiple case ID matches", r['file_id'])
        cid = r['cases'][0]['case_id']
        fid = r['file_id']
        fname = r['file_name']
        outfn = os.path.join(outdir, cid, fname)
        meta.setdefault(cid, [])
        logger.info("Case %s. File ID %s. Output path %s.", cid, fid, outfn)
        try:
            download_data(fid, outfn, legacy=True)
        except Exception:
            logger.exception("Failed to download %s for case id %s", fname, cid)
            num_error += 1
        else:
            logger.info("Downloaded case ID %s file ID %s to %s", cid, fid, outfn)
            meta[cid].append(r)
            num_files += 1
    logger.info("Downloaded %d files. Encountered %d errors.", num_files, num_error)

    num_meta = 0
    num_meta_errors = 0
    # write one meta file per case
    for cid, arr in meta.iteritems():
        meta_fn = os.path.join(outdir, cid, 'meta.json')
        if os.path.exists(meta_fn):
            logger.error("Meta file already exists: %s", meta_fn)
            num_meta_errors += 1
        else:
            with open(meta_fn, 'wb') as f:
                json.dump(arr, f)
            num_meta += 1
    logger.info("Created %d meta files. Encountered %d errors.", num_meta, num_meta_errors)
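# The two snippets above build GDC API filter objects with helpers such as
# in_query, equal_query, and_query and or_query that are not shown here.
# Based on the inline "op"/"content" dict in the first snippet, a plausible
# sketch of these helpers (names match the calls above; the bodies are
# assumptions about how the GDC filter JSON is assembled):

def in_query(field, values):
    # {"op": "in", "content": {"field": ..., "value": [...]}}
    return {"op": "in", "content": {"field": field, "value": list(values)}}

def equal_query(field, value):
    # {"op": "=", "content": {"field": ..., "value": ...}}
    return {"op": "=", "content": {"field": field, "value": value}}

def and_query(*queries):
    # combine sub-filters with a logical AND
    return {"op": "and", "content": list(queries)}

def or_query(*queries):
    # combine sub-filters with a logical OR
    return {"op": "or", "content": list(queries)}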
read_dir = sys.argv[1]
ref_fn = sys.argv[2]
out_dir = sys.argv[3]
qmap_args = sys.argv[4:]

# put global log output into the output directory
log_fn = os.path.join(out_dir, 'qualimap')

if not os.path.isdir(read_dir):
    raise ValueError("Could not find specified read directory %s" % read_dir)
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

logger = get_file_logger(__name__, log_fn)

# BAM file discovery
rr = re.compile(r'\.bam$', flags=re.IGNORECASE)
flist = [t for t in os.listdir(read_dir) if re.search(rr, t)]

# check for existing output and identify files
fl = {}
for t in flist:
    base = re.sub(r'\.bam$', '', t)
    out_subdir = os.path.join(out_dir, base)
    # if the output folder already exists, log a warning and skip this file
    if os.path.isdir(out_subdir):
        logger.warn("Folder already exists: %s. Skipping.", out_subdir)
        continue
    else:
        os.makedirs(out_subdir)
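# The excerpt ends after creating each per-sample output directory; the fl
# dict is declared but not yet filled in the portion shown. A hedged sketch of
# what the follow-on step might look like, assuming fl gets populated with
# {bam_filename: output_subdir} pairs and that the script shells out to the
# standard `qualimap bamqc` command, forwarding any extra qmap_args:
import subprocess

for bam_fn, out_subdir in fl.items():
    cmd = ['qualimap', 'bamqc',
           '-bam', os.path.join(read_dir, bam_fn),
           '-outdir', out_subdir] + list(qmap_args)
    logger.info("Running: %s", ' '.join(cmd))
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError:
        logger.exception("qualimap bamqc failed for %s", bam_fn)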
    # {'base_dir': '2017-08-23',
    #  'chipType': 'EPIC',
    #  'sampleType': 'KRYO DNA'},
    # {'base_dir': '2017-09-19',
    #  'chipType': 'EPIC',
    #  'sampleType': 'KRYO DNA'},
    {'base_dir': 'GSE92462_450K',
     'chipType': '450K',
     'sampleType': 'KRYO DNA'},
]

# include_subdir = True
include_subdir = False
n_retry = 3
wait_between_retries = 5  # seconds

outdir = unique_output_dir('heidelberg_bulk_upload', reuse_empty=True)
flog = get_file_logger('heidelberg_bulk_upload',
                       os.path.join(outdir, 'automated_upload.log'))

# include = [
#     'GBM018 P10 DNA 8/11/2016 CLEANED',
#     'GBM019 P4 DNA 8/11/2016 CLEANED',
#     'GBM024 P9 DNA 13/10/2016',
#     'GBM026 P8 DNA 24/10/2016',
#     'GBM031 P4 DNA 13/10/2016'
# ]
include = [
    'GSC80',
    'GSC164',
    'GSC64',
    'GSC76',
    'GSC102',
    'GSC6',
    'GSC14',