Example #1
    def __init__(self, url_list, brand_names=[]):
        for i in url_list:
            self.urls.append(check_url(i))
        for i in brand_names:
            self.brand_names.append(i)

        user_agent = 'Mozilla/6.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = { 'User-Agent' : user_agent }
        self.log = get_file_logger('ju_report')
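
Every snippet in this listing calls get_file_logger without ever showing its definition. As a rough sketch only, inferred from the call patterns below (positional or keyword name/filestem arguments) and not the original implementation, such a helper usually wraps the standard logging module and attaches a FileHandler keyed on the logger name:

import logging

def get_file_logger(name, filestem=None):
    # Hypothetical sketch: build a named logger that writes to a file.
    # `filestem` defaults to '<name>.log' when omitted; this mirrors calls
    # such as get_file_logger('ju_report') above, but is an assumption.
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)
    if not logger.handlers:  # avoid attaching duplicate handlers on reuse
        handler = logging.FileHandler(filestem or '%s.log' % name)
        handler.setFormatter(logging.Formatter(
            '%(asctime)s %(name)s %(levelname)s %(message)s'))
        logger.addHandler(handler)
    return logger
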
Example #2
    def __init__(self, url_list, brand_names=[]):
        for i in url_list:
            self.urls.append(check_url(i))
        for i in brand_names:
            self.brand_names.append(i)

        user_agent = 'Mozilla/6.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': user_agent}
        self.log = get_file_logger('ju_report')
Example #3
def main():
    log = get_file_logger('ju_item_insert')
    hostaddress = 'localhost'
    database = 'ju_db'
    user = MYSQL_USER
    password = MYSQL_PASSWORD
    db = torndb.Connection(hostaddress, database, user, password)

    excel_handler = DealExcel(INPUT_FILE_PATH, 'Sheet1')
    ju_brands = excel_handler.read_column_excel(1, 2)
    ju_urls = excel_handler.read_column_excel(2, 2)
    ju_pages = GetPageData(ju_urls, ju_brands)
    result = []
    for i, j in zip(ju_urls, ju_brands):
        result.append(ju_pages.get_page(i, j))
    # Extract the floors from each fetched page
    floors = []
    for index, item in enumerate(result):
        floors.append(GetJuFloor(item['data'], item['title']).get_floors())

    values = []
    time_start = time.strftime('%Y-%m-%d-%H:%M:%S', time.localtime(time.time()))

    for item in floors:
        row_big_item = []
        row_small_item = []
        row_big_item = GetJuItem(item.get('big'), item['brand_name']).get_big_items()
        values.extend(row_big_item)
        small_pages = GetPageData(item['small'].get('urls'), item['brand_name']).get_pages()
        for i in small_pages:
            row_small_item.extend(GetJuItem(i, item['brand_name']).get_small_items())
        values.extend(row_small_item)

    sql_item = "INSERT INTO ju_brand_item (id, name, description, created, price, orig_price, started, item_type, brand_name) \
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)"

    sql_num = "INSERT INTO ju_brand_item_num (item_id, sold_num) VALUES (%s, %s)"

    sql_item_detail = "INSERT INTO ju_brand_item_detail (item_id, img_src, detail_src) VALUES (%s, %s, %s)"

    db_item = []
    db_item_num = []
    db_item_detail = []
    for value in values:
        try:
            if value['str_people'] == u'\u4eba\u5df2\u4e70\n':
                is_started = True
            else:
                is_started = False
            item_id = string.atoi(re.findall(r'\d+', value['src_detail'])[0])
            db_item.append([item_id, value['name'], value['desc'], value['date_time'], value['price'], value['orig_price'], is_started, value['item_type'], value['brand_name']])
            db_item_num.append([item_id, value['sold_num']])
            db_item_detail.append([item_id, value['img_src'], value['src_detail']])
        except Exception, e:
            log.error(str(value['name']))
            continue
Example #4
 def set_logger(self):
     created_dir = False
     if not os.path.isdir(self.log_dir):
         try:
             os.makedirs(self.log_dir)
             created_dir = True
         except Exception:
             raise AttributeError(
                 "Failed to create log output directory %s" % self.log_dir)
     self.logger = get_file_logger(
         name=self.title,
         filestem=os.path.join(self.log_dir,
                               '%s.%s.log' % (self.title, self.now_str)))
     if created_dir:
         self.logger.info("Created log output directory %s.", self.log_dir)
Example #5
class GetPageData(object):
    '''
    Fetch the page content for each input URL and store the text as unicode.
    '''
    urls = []
    brand_names = []
    log = get_file_logger('get_page')

    def __init__(self, url_list, brand_names=[]):
        for i in url_list:
            self.urls.append(check_url(i))
        for i in brand_names:
            self.brand_names.append(i)

        user_agent = 'Mozilla/6.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': user_agent}

    def get_page(self, url='', page_title=u'', decode_str='gbk'):
        """
        将url对应的界面数据以及名称打包成结果返回
        """
        try:
            page = ''
            if not url.strip():
                raise Exception('url is None')
            req = urllib2.Request(url, headers=self.headers)
            response = urllib2.urlopen(req)
            html = response.read()
            page = html.decode(decode_str)
        except Exception, e:
            self.log.error(e)
            return None

        result = {
            'data': page,
            'title': page_title,
        }
        return result
Example #6
 def __init__(self, page, brand_name):
     self.unicode_page = check_page(page)
     self.log = get_file_logger('ju_report')
     self.brand_name = brand_name
Example #7
 def __init__(self, file_name, sheet_name):
     self.file_name = file_name
     self.sheet_name = sheet_name
     self.log = get_file_logger('excel_handler')
Example #8
 def __init__(self, file_name, sheet_name):
     self.file_name = file_name
     self.sheet_name = sheet_name
     self.log = get_file_logger('excel_handler')
Example #9
def main():
    log = get_file_logger('ju_item_insert')
    hostaddress = 'localhost'
    database = 'ju_db'
    user = MYSQL_USER
    password = MYSQL_PASSWORD
    db = torndb.Connection(hostaddress, database, user, password)

    excel_handler = DealExcel(INPUT_FILE_PATH, 'Sheet1')
    ju_brands = excel_handler.read_column_excel(1, 2)
    ju_urls = excel_handler.read_column_excel(2, 2)
    ju_pages = GetPageData(ju_urls, ju_brands)
    result = []
    for i, j in zip(ju_urls, ju_brands):
        result.append(ju_pages.get_page(i, j))
    # Extract the floors from each fetched page
    floors = []
    for index, item in enumerate(result):
        floors.append(GetJuFloor(item['data'], item['title']).get_floors())

    values = []
    time_start = time.strftime('%Y-%m-%d-%H:%M:%S',
                               time.localtime(time.time()))

    for item in floors:
        row_big_item = []
        row_small_item = []
        row_big_item = GetJuItem(item.get('big'),
                                 item['brand_name']).get_big_items()
        values.extend(row_big_item)
        small_pages = GetPageData(item['small'].get('urls'),
                                  item['brand_name']).get_pages()
        for i in small_pages:
            row_small_item.extend(
                GetJuItem(i, item['brand_name']).get_small_items())
        values.extend(row_small_item)

    sql_item = "INSERT INTO ju_brand_item (id, name, description, created, price, orig_price, started, item_type, brand_name) \
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)"

    sql_num = "INSERT INTO ju_brand_item_num (item_id, sold_num) VALUES (%s, %s)"

    sql_item_detail = "INSERT INTO ju_brand_item_detail (item_id, img_src, detail_src) VALUES (%s, %s, %s)"

    db_item = []
    db_item_num = []
    db_item_detail = []
    for value in values:
        try:
            if value['str_people'] == u'\u4eba\u5df2\u4e70\n':
                is_started = True
            else:
                is_started = False
            item_id = string.atoi(re.findall(r'\d+', value['src_detail'])[0])
            db_item.append([
                item_id, value['name'], value['desc'], value['date_time'],
                value['price'], value['orig_price'], is_started,
                value['item_type'], value['brand_name']
            ])
            db_item_num.append([item_id, value['sold_num']])
            db_item_detail.append(
                [item_id, value['img_src'], value['src_detail']])
        except Exception, e:
            log.error(str(value['name']))
            continue
Example #10
 def set_logger(self):
     self.logger = get_file_logger(
         name=self.title,
         filestem=os.path.join(self.log_dir,
                               '%s.%s.log' % (self.title, self.now_str)))
Example #11
 def __init__(self, floor_data, brand_name):
     self.log = get_file_logger('ju_report')
     self.floor_data = floor_data
     self.brand_name = brand_name
Example #12
 def __init__(self, page, brand_name):
     self.unicode_page = check_page(page)
     self.log = get_file_logger('ju_report')
     self.brand_name = brand_name
Example #13
def get_methylation_gene_expression_data(case_ids):
    outdir = unique_output_dir("gdc-nih_gene_expr", reuse_empty=True)
    logger = get_file_logger("nih_methylation_gene_counts",
                             os.path.join(outdir, "getter.log"))

    # get relevant files for download

    qry_case = {
        "op": "in",
        "content": {
            "field": "cases.case_id",
            "value": list(case_ids)
        }
    }

    qry = {
        "filters":
        and_query(qry_primary, qry_case, or_query(qry_trans, qry_meth450)),
        "format":
        "json",
        "fields":
        ','.join(FILE_FIELDS),
        "size":
        10000
    }
    response = requests.post(FILES_ENDPOINT, json=qry)
    if response.status_code != 200:
        logger.error("Initial query failed: %s", response.content)
        raise ValueError("Query failed")
    res = response.json()['data']['hits']

    meta = {}
    num_error = 0
    num_files = 0

    for r in res:
        if len(r['cases']) > 1:
            logger.error("File with ID %s has multiple case ID matches",
                         r['file_id'])
        cid = r['cases'][0]['case_id']
        fid = r['file_id']
        fname = r['file_name']
        meta.setdefault(cid, {})
        if r['data_type'] == 'Gene Expression Quantification':
            if 'FPKM-UQ' in fname:
                continue
            elif 'FPKM' in fname:
                meta[cid]['fpkm'] = r
                outfn = os.path.join(outdir, cid, 'fpkm.gz')
            elif 'htseq.counts' in fname:
                meta[cid]['counts'] = r
                outfn = os.path.join(outdir, cid, 'counts.gz')
        elif r['data_type'] == 'Methylation Beta Value':
            meta[cid]['methylation'] = r
            outfn = os.path.join(outdir, cid, 'methylation.txt')
        try:
            download_data(fid, outfn)
        except Exception:
            logger.exception("Failed to download %s for case id %s", fname,
                             cid)
            num_error += 1
        else:
            logger.info("Downloaded case ID %s file ID %s to %s", cid, fid,
                        outfn)
            num_files += 1

    logger.info("Downloaded %d files. Encountered %d errors.", num_files,
                num_error)

    # run back through and write meta files

    num_meta = 0
    num_meta_errors = 0

    # write meta files
    for cid, d in meta.iteritems():
        meta_fn = os.path.join(outdir, cid, 'meta.json')
        if os.path.exists(meta_fn):
            logger.error("Meta file already exists: %s", meta_fn)
            num_meta_errors += 1
        else:
            with open(meta_fn, 'wb') as f:
                json.dump(d, f)
            num_meta += 1
    logger.info("Create %d meta files. Encountered %d errors.", num_meta,
                num_meta_errors)
Example #14
def get_legacy_idat(case_ids):
    outdir = unique_output_dir("gdc-nih_methylation", reuse_empty=True)
    logger = get_file_logger("legacy_idats",
                             os.path.join(outdir, "getter.log"))

    qry_case = in_query("cases.case_id", case_ids)

    qry_idat = equal_query("files.data_format", "idat")

    qry = {
        "filters": and_query(qry_primary, qry_case, qry_idat, qry_meth450),
        "format": "json",
        "fields": ','.join(FILE_FIELDS),
        "size": 10000
    }

    response = requests.post(LEGACY_FILES_ENDPOINT, json=qry)
    if response.status_code != 200:
        logger.error("Initial query failed: %s", response.content)
        raise ValueError("Query failed")

    res = response.json()['data']['hits']
    logger.info("Found %d idat files.", len(res))

    num_error = 0
    num_files = 0

    # we need to keep track of the files in order to write meta correctly
    meta = {}

    for r in res:
        if len(r['cases']) > 1:
            logger.error("File with ID %s has multiple case ID matches",
                         r['file_id'])
        cid = r['cases'][0]['case_id']
        fid = r['file_id']
        fname = r['file_name']
        outfn = os.path.join(outdir, cid, fname)
        meta.setdefault(cid, [])
        logger.info("Case %s. File ID %s. Output path %s.", cid, fid, outfn)
        try:
            download_data(fid, outfn, legacy=True)
        except Exception:
            logger.exception("Failed to download %s for case id %s", fname,
                             cid)
            num_error += 1
        else:
            logger.info("Downloaded case ID %s file ID %s to %s", cid, fid,
                        outfn)
            meta[cid].append(r)
            num_files += 1

    logger.info("Downloaded %d files. Encountered %d errors.", num_files,
                num_error)

    num_meta = 0
    num_meta_errors = 0

    # write meta files
    for cid, arr in meta.iteritems():
        meta_fn = os.path.join(outdir, cid, 'meta.json')
        if os.path.exists(meta_fn):
            logger.error("Meta file already exists: %s", meta_fn)
            num_meta_errors += 1
        else:
            with open(meta_fn, 'wb') as f:
                json.dump(arr, f)
            num_meta += 1
    logger.info("Create %d meta files. Encountered %d errors.", num_meta,
                num_meta_errors)
Example #15
 def __init__(self, floor_data, brand_name):
     self.log = get_file_logger('ju_report')
     self.floor_data = floor_data
     self.brand_name = brand_name
Example #16
    read_dir = sys.argv[1]
    ref_fn = sys.argv[2]
    out_dir = sys.argv[3]
    qmap_args = sys.argv[4:]

    # put global log output into the output directory
    log_fn = os.path.join(out_dir, 'qualimap')

    if not os.path.isdir(read_dir):
        raise ValueError("Could not find specified read directory %s" %
                         read_dir)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    logger = get_file_logger(__name__, log_fn)

    # fastq.gz file discovery
    rr = re.compile(r'\.bam$', flags=re.IGNORECASE)
    flist = [t for t in os.listdir(read_dir) if re.search(rr, t)]
    # check for existing output and identify files
    fl = {}
    for t in flist:
        base = re.sub(r'\.bam$', '', t)
        out_subdir = os.path.join(out_dir, base)
        # if output folder exists, log warning and skip
        if os.path.isdir(out_subdir):
            logger.warn("Folder already exists: %s. Skipping.", out_subdir)
            continue
        else:
            os.makedirs(out_subdir)
Example #17
     # {'base_dir': '2017-08-23',
     #  'chipType': 'EPIC',
     #  'sampleType': 'KRYO DNA'},
     # {'base_dir': '2017-09-19',
     #  'chipType': 'EPIC',
     #  'sampleType': 'KRYO DNA'},
     {'base_dir': 'GSE92462_450K',
      'chipType': '450K',
      'sampleType': 'KRYO DNA'},
 ]
 # include_subdir = True
 include_subdir = False
 n_retry = 3
 wait_between_retries = 5  # seconds
 outdir = unique_output_dir('heidelberg_bulk_upload', reuse_empty=True)
 flog = get_file_logger('heidelberg_bulk_upload', os.path.join(outdir, 'automated_upload.log'))
 # include = [
 #     'GBM018 P10 DNA 8/11/2016 CLEANED',
 #     'GBM019 P4 DNA 8/11/2016 CLEANED',
 #     'GBM024 P9 DNA     13/10/2016',
 #     'GBM026 P8 DNA 24/10/2016',
 #     'GBM031 P4 DNA     13/10/2016'
 # ]
 include = [
     'GSC80',
     'GSC164',
     'GSC64',
     'GSC76',
     'GSC102',
     'GSC6',
     'GSC14',