Пример #1
0
def get_comment_info(topid="237773700", min_timestamp=1568735760):
    """Collect comments page by page and append them to the global ``info_list``.

    Each collected entry is ``[nick, formatted_time, praisenum, content]``.
    Paging starts at page 1 and stops when a page has no comments or when
    the first comment older than ``min_timestamp`` is encountered.

    Args:
        topid: Value sent as the ``topid`` query parameter.
        min_timestamp: UNIX timestamp; a comment with a smaller ``time``
            ends the whole crawl.
    """
    global info_list
    url = "https://c.y.qq.com/base/fcgi-bin/fcg_global_comment_h5.fcg"
    pagenum = 1
    while True:
        print(pagenum)
        querystring = {
            "biztype": "1",
            "topid": topid,
            "cmd": "8",
            "pagenum": pagenum,
            "pagesize": "25",
        }
        response = requests.request("GET", url, params=querystring)
        resp = json.loads(response.text)

        # A payload without a 'comment' object is treated like an empty page.
        commentlist = (resp.get('comment') or {}).get('commentlist')
        if not commentlist:
            return

        for comment in commentlist:
            if comment.get('time') < min_timestamp:
                return
            # Convert the UNIX timestamp to a readable local time string.
            time_local = time.localtime(comment.get('time'))
            one_time = time.strftime("%Y-%m-%d %H:%M:%S", time_local)
            one_praisenum = comment.get('praisenum')
            # str is immutable: the substitution result has to be kept,
            # otherwise the illegal characters are never actually removed.
            one_name = ILLEGAL_CHARACTERS_RE.sub(r'', comment.get('nick') or '')
            one_comment = ILLEGAL_CHARACTERS_RE.sub(
                r'', comment.get('rootcommentcontent') or '')
            info_list.append([one_name, one_time, one_praisenum, one_comment])
        pagenum += 1
Пример #2
0
    def export_csv(self):
        """Export the rows returned by ``self.sql`` to ``self.file`` as UTF-8 CSV.

        The query is executed once to learn the column names (the keys of the
        first row) and a second time to read the data.  When
        ``self.affected_row`` is at most 100000 the rows are loaded with
        ``fetchall``; otherwise an ``SSDictCursor`` is read row by row.
        Progress messages go to ``self.pull_msg`` and ``self.execute_log``.
        """

        def strip_illegal(row):
            # Only str values are filtered; other values are left untouched.
            for k, v in row.items():
                if isinstance(v, str):
                    row[k] = ILLEGAL_CHARACTERS_RE.sub(r'', v)
            return row

        # Use the column names as the CSV header.
        self.conn.cursorclass = pymysql.cursors.DictCursor
        with self.conn.cursor() as cursor:
            cursor.execute(self.sql)
            first_row = cursor.fetchone()
            if first_row:
                title = list(first_row)
            else:
                # An empty result set has no first row to take the keys from;
                # fall back to the cursor metadata instead of crashing.
                title = [column[0] for column in (cursor.description or ())]

        with open(r'%s' % self.file, 'w', newline='',
                  encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=title)
            writer.writeheader()

            # Fetch the data and write it to the file.
            if self.affected_row <= 100000:
                # Small exports: read everything into memory with fetchall.
                self.conn.cursorclass = pymysql.cursors.DictCursor
                with self.conn.cursor() as cursor:
                    msg = f'正在导出SQL:{self.sql}'
                    self.pull_msg(msg)
                    self.execute_log.append(msg)

                    cursor.execute(self.sql)
                    rows = cursor.fetchall()

                    msg = '正在处理数据\n编码为:UTF-8'
                    self.pull_msg(msg)
                    self.execute_log.append(msg)

                    for row in rows:
                        writer.writerow(strip_illegal(row))
            else:
                # Large exports: iterate with SSDictCursor one row at a time.
                self.conn.cursorclass = pymysql.cursors.SSDictCursor
                with self.conn.cursor() as cursor:
                    msg = f'正在导出SQL:{self.sql}'
                    self.pull_msg(msg)
                    self.execute_log.append(msg)

                    cursor.execute(self.sql)
                    while True:
                        row = cursor.fetchone()
                        if not row:
                            break
                        writer.writerow(strip_illegal(row))
Пример #3
0
def remove_illgeal_characters(s):
    """Return ``s`` with every ``ILLEGAL_CHARACTERS_RE`` match removed.

    Falsy input yields an empty string.  If the substitution raises
    ``UnicodeEncodeError`` it is retried on ``s.decode('utf-8')``.  Any
    other failure is swallowed and an empty string is returned (the
    original note on this behaviour reads "never 502").
    """
    cleaned = u''
    if s:
        # noinspection PyBroadException
        try:
            try:
                cleaned = ILLEGAL_CHARACTERS_RE.sub(r'', s)
            except UnicodeEncodeError:
                cleaned = ILLEGAL_CHARACTERS_RE.sub(r'', s.decode('utf-8'))
        except Exception:
            cleaned = u''
    return cleaned
Пример #4
0
    def DB2xlsx(self):
        """Dump the ``usermsgs`` collection into one .xlsx file per hour.

        Every document's ``Msgdate`` is shifted by +8 hours (presumably
        UTC -> UTC+8; confirm against the data source) and the documents are
        grouped by the resulting "YYYY/MM/DD, HH" key.  Each group is written
        to ``./output/Data <today>/<key>.xlsx``.  Documents whose
        ``username`` or ``quiz_ans`` contains a character matched by
        ``ILLEGAL_CHARACTERS_RE`` are reported on stdout but still exported.
        """
        # 'username', 'mode', 'quiz_class', 'quiz_no', 'quiz_ans', 'Msgdate'
        shift = datetime.timedelta(hours=8)
        output = {}

        DBserver = pymongo.MongoClient('mongodb://localhost:27017/')
        try:
            collection = DBserver['DataDB']['usermsgs']
            for item in collection.find():
                shifted = item['Msgdate'] + shift
                stamp = shifted.strftime("%Y/%m/%d, %H:%M:%S")

                # Report (but do not drop) values containing illegal characters.
                for field in ('username', 'quiz_ans'):
                    if ILLEGAL_CHARACTERS_RE.search(item[field]):
                        print(stamp)
                        print(item[field], field, sep=';')

                output.setdefault(shifted.strftime("%Y/%m/%d, %H"), []).append([
                    item['username'], item['mode'], item['quiz_class'],
                    item['quiz_no'], item['quiz_ans'], stamp
                ])
        finally:
            # Release the connection even when a document is malformed.
            DBserver.close()

        # Evaluate "today" once so the directory that is created and the
        # paths that are written cannot disagree when the run spans midnight.
        out_dir = './output/Data ' + datetime.date.today().strftime("%Y.%m.%d")
        os.makedirs(out_dir, exist_ok=True)

        for datestr, rows in output.items():
            file = pd.DataFrame(rows,
                                columns=[
                                    "UserName", "Single/Double Mode",
                                    "Quiz Class", "Quiz #", "Ans", "Time"
                                ])
            s = out_dir + '/' + \
                (datestr.replace(", ", "-").replace("/", "-")) + '.xlsx'
            file.to_excel(s, engine='xlsxwriter')
        print("done~")
Пример #5
0
def write_excel(schools, containers):
    """Write one row per school to '学校统计数据.xlsx'.

    Column 1 holds the school taken from ``schools``; columns 2-10 hold the
    first nine entries of the matching ``containers`` item, converted to
    ``str`` and stripped of characters matched by ``ILLEGAL_CHARACTERS_RE``.
    """
    print('正在保存文件.......')
    workbook = Workbook()
    sheet = workbook.create_sheet('学校统计数据', index=0)
    captions = [
        '学校', '学校简介', '录取规则', '奖学金设置', '食宿条件', '联系方法', '收费项目', '毕业生就业', '体检要求',
        '其他'
    ]
    print('学校总数:', len(schools))

    # Header row.
    for column, caption in enumerate(captions, start=1):
        sheet.cell(row=1, column=column, value=caption)

    # Data rows start on the second spreadsheet row.
    for position, school in enumerate(schools):
        row_number = position + 2
        sheet.cell(row=row_number, column=1, value=school)
        for field in range(9):
            sheet.cell(row=row_number,
                       column=field + 2,
                       value=ILLEGAL_CHARACTERS_RE.sub(
                           r'', str(containers[position][field])))

    # Save the file.
    workbook.save('学校统计数据.xlsx')
Пример #6
0
 def insert_to_excel(self, results, tags):
     """Write one worksheet of book rows per tag and save the workbook.

     ``results[i]`` holds the records for ``tags[i]``.  Every field of a
     record is stripped of characters matched by ``ILLEGAL_CHARACTERS_RE``;
     fields 1 and 2 are then converted to ``float`` and ``int``.  A record
     that cannot be processed is reported on stdout and skipped.  The file
     is saved as ``book_lists-<tag1>-<tag2>...xlsx``.
     """
     # NOTE(review): ``optimized_write`` is kept exactly as the original
     # passes it; confirm the installed openpyxl still accepts this keyword.
     wb = Workbook(optimized_write=True)
     sheets = [wb.create_sheet(title=tag) for tag in tags]
     for i, sheet in enumerate(sheets):
         sheet.append(['序号', '书名', '评分', '评价人数', '作者/译者/出版社/出版时间/价格', '链接'])
         count = 1
         for booklist in results[i]:
             try:
                 book_list = [
                     ILLEGAL_CHARACTERS_RE.sub(r'', each) for each in booklist
                 ]
                 if book_list:
                     sheet.append([
                         count, book_list[0],
                         float(book_list[1]),
                         int(book_list[2]), book_list[3], book_list[4]
                     ])
                     count += 1
                 else:
                     print('Fail to save i:', i, book_list)
             except Exception:
                 # Best effort per record, but no longer a bare ``except`` so
                 # KeyboardInterrupt / SystemExit are not swallowed.
                 print('Fail to save i:', i, booklist)
     save_path = 'book_lists' + ''.join('-' + tag for tag in tags) + '.xlsx'
     wb.save(save_path)
Пример #7
0
 def insert_to_excel(self, results, tags):
     """Write one worksheet of movie rows per tag and save the workbook.

     ``results[i]`` holds the records for ``tags[i]``.  Every field of a
     record is stripped of characters matched by ``ILLEGAL_CHARACTERS_RE``;
     fields 1 and 2 are then converted to ``float`` and ``int``.  A record
     that cannot be processed is reported on stdout and skipped.  The file
     is saved as ``movie_lists-<tag1>-<tag2>...xlsx``.
     """
     # NOTE(review): ``optimized_write`` is kept exactly as the original
     # passes it; confirm the installed openpyxl still accepts this keyword.
     wb = Workbook(optimized_write=True)
     sheets = [wb.create_sheet(title=tag) for tag in tags]
     for i, sheet in enumerate(sheets):
         sheet.append(
             ['序号', '电影名', '评分', '评价人数', '制片地区/类型/上映年份/导演/主演', '链接'])
         count = 1
         for movielist in results[i]:
             try:
                 movie_list = [
                     ILLEGAL_CHARACTERS_RE.sub(r'', each) for each in movielist
                 ]
                 if movie_list:
                     sheet.append([
                         count, movie_list[0],
                         float(movie_list[1]),
                         int(movie_list[2]), movie_list[3], movie_list[4]
                     ])
                     count += 1
                 else:
                     print('Fail to save i:', i, movie_list)
             except Exception:
                 # Best effort per record, but no longer a bare ``except`` so
                 # KeyboardInterrupt / SystemExit are not swallowed.
                 print('Fail to save i:', i, movielist)
     save_path = 'movie_lists' + ''.join('-' + tag for tag in tags) + '.xlsx'
     wb.save(save_path)
Пример #8
0
def predict(text):
    """Classify ``text`` and return the matching ``news_id_type_dict`` entry.

    The text is converted to ``str``, stripped, cleaned with
    ``ILLEGAL_CHARACTERS_RE`` and encoded to 256 padded token ids before
    being fed to the module-level ``classifier``.  The global ``index``
    counter is incremented on every call.
    """
    global index
    cleaned = ILLEGAL_CHARACTERS_RE.sub(r'', str(text).strip())
    ids = tokenizer.encode(cleaned,
                           max_length=256,
                           pad_to_max_length=True)
    mask = get_atten_mask(ids)
    # Segment ids are derived from the token ids without first/last entry.
    segment_ids = tokenizer.create_token_type_ids_from_sequences(
        token_ids_0=ids[1:-1])

    # unsqueeze(0) adds a leading dimension to each tensor.
    logits = classifier(
        input_ids=torch.LongTensor(ids).unsqueeze(0),
        token_type_ids=torch.LongTensor(segment_ids).unsqueeze(0),
        attention_mask=torch.LongTensor(mask).unsqueeze(0),
    )
    _, best = logits[0].max(1)
    index += 1
    return news_id_type_dict[best.item()]
Пример #9
0
 def _sanitize_value(self, raw_value):
     # prepend ' if raw_value is starting with possible malicious char
     if self.sanitize_fields and raw_value:
         str_value = str(raw_value)
         str_value = ILLEGAL_CHARACTERS_RE.sub('', str_value)   # remove ILLEGAL_CHARACTERS so it doesn't crash
         return "'" + str_value if str_value.startswith(ESCAPE_CHARS) else str_value
     return raw_value
Пример #10
0
    def write_sheet(self, sheet_name, sheet):
        """Add ``sheet`` to the workbook as an ODF table named ``sheet_name``.

        The first row holds the headings obtained from ``list(sheet)``; one
        row follows for every entry of ``sheet.lines``.  String values are
        stripped of ``ILLEGAL_CHARACTERS_RE`` matches, and a
        ``DataErrorWarning`` is issued whenever that changes the value.
        """
        table = odf.table.Table(name=sheet_name)
        columns = list(sheet)

        # Heading row.
        heading = odf.table.TableRow()
        for column in columns:
            heading.addElement(self._make_cell(column))
        table.addElement(heading)

        # One table row per line; a missing column yields whatever .get returns.
        for record in sheet.lines:
            table_row = odf.table.TableRow()
            for column in columns:
                cell_value = record.get(column)
                if isinstance(cell_value, str):
                    cleaned = ILLEGAL_CHARACTERS_RE.sub("", cell_value)
                    if cleaned != cell_value:
                        warn(
                            _("Character(s) in '{}' are not allowed in a spreadsheet cell. Those character(s) will be removed"
                              ).format(cell_value),
                            DataErrorWarning,
                        )
                    cell_value = cleaned
                table_row.addElement(self._make_cell(cell_value))
            table.addElement(table_row)

        self.workbook.spreadsheet.addElement(table)
Пример #11
0
def text_predict(examples, model, tokenizer, id2label):
    """Predict a label for every text in ``examples``.

    The model is switched to eval mode, then each text is converted to
    ``str``, stripped, cleaned with ``ILLEGAL_CHARACTERS_RE`` and encoded
    with ``args.max_length`` (padded and truncated).  The forward pass runs
    under ``torch.no_grad()``.

    Returns:
        A list with one ``id2label`` entry per input text, in input order.
    """
    model.eval()
    predictions = []
    for raw in examples:
        cleaned = ILLEGAL_CHARACTERS_RE.sub(r'', str(raw).strip())
        encoded = tokenizer.encode_plus(cleaned,
                                        max_length=args.max_length,
                                        pad_to_max_length=True,
                                        truncation=True)
        ids = encoded['input_ids']
        # Segment ids are derived from the token ids without first/last entry.
        segment_ids = tokenizer.create_token_type_ids_from_sequences(
            token_ids_0=ids[1:-1])

        batch = {
            'input_ids': torch.LongTensor(ids).unsqueeze(0),
            'token_type_ids': torch.LongTensor(segment_ids).unsqueeze(0),
            'attention_mask':
            torch.LongTensor(encoded['attention_mask']).unsqueeze(0),
        }
        with torch.no_grad():
            logits = model(**batch)

        _, best = logits[0].max(1)
        predictions.append(id2label[best.item()])
    return predictions
Пример #12
0
    def _exprt_to_csv(self, cnx):
        """Run ``self.sql`` on ``cnx`` and write the rows to ``self.tmp_file`` as CSV.

        Rows are read one at a time through an ``SSDictCursor``, string
        values are stripped of ``ILLEGAL_CHARACTERS_RE`` matches, and the
        number of rows read is stored in ``self.result['affected_rows']``.
        Progress messages go to ``self.pm.pull`` and ``self.execute_log``.
        """
        # Push a progress message.
        msg = f'正在执行导出SQL: {self.sql} \n'
        self.pm.pull(msg=msg)
        self.execute_log.append(msg)

        # Open the CSV file.
        with open(self.tmp_file, 'w', newline='', encoding='utf-8') as csvfile:
            # Read through a cursor so large results do not cause an OOM.
            cnx.cursorclass = pymysql.cursors.SSDictCursor
            with cnx.cursor() as cursor:
                cursor.execute(self.sql)
                # Push a progress message.
                msg = '正在处理并生成CSV数据 \n'
                self.pm.pull(msg=msg)
                self.execute_log.append(msg)
                # Header.
                fieldnames = [x[0] for x in cursor.description]
                # Make sure the key exists even if reading fails part-way.
                self.result['affected_rows'] = cursor.rownumber
                # Create the CSV writer.
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writeheader()
                # Process the data.
                while True:
                    row = cursor.fetchone()
                    if not row:
                        break
                    # Filter out the special characters.
                    for k, v in row.items():
                        if isinstance(v, str):
                            row[k] = ILLEGAL_CHARACTERS_RE.sub(r'', v)
                    writer.writerow(row)
                # rownumber only advances as rows are fetched, so the real
                # row count is known only after the loop has finished.
                self.result['affected_rows'] = cursor.rownumber
Пример #13
0
def get_app_des(app_id, country):
    """Fetch an app's base info and return ``(languages, description, release_date)``.

    The description has ``<br />`` tags and ``ILLEGAL_CHARACTERS_RE``
    matches removed.  The other two values are extracted with regular
    expressions from ``str(html['appInfo'])``; each is ``''`` when its
    pattern does not match.
    """

    def first_match(pattern, text):
        # Explicit "no match" handling instead of a bare ``except`` around
        # ``findall(...)[0]``, which also hid unrelated errors.
        found = re.findall(pattern, text)
        return found[0] if found else ''

    data = {'appid': app_id, 'country': country}
    params = {
        'analysis': analysis_parser(data, url='/app/baseinfo'),
        'appid': data['appid'],
        'country': data['country']
    }
    res = requests.get(url='https://api.qimai.cn/app/baseinfo',
                       params=params,
                       headers=headers)
    html = json.loads(res.text)

    description = re.sub(r'<br />', '', html['description'])
    description = ILLEGAL_CHARACTERS_RE.sub(r'', description)

    appinfo = str(html['appInfo'])
    yy = first_match(r"'name': '支持语言', 'value': '(.*?)'}", appinfo)
    start_date = first_match(r"'name': '发布日期', 'value': '(.*?)'}", appinfo)
    return (yy, description, start_date)
Пример #14
0
def getComment(driver, url, song):
    """Scrape up to 15 comments for ``song`` from the page at ``url``.

    Args:
        driver: Browser driver providing ``get``, ``switch_to.frame`` and
            ``find_elements_by_xpath``.
        url: Page to open.
        song: Song title stored with every comment and printed as progress.

    Returns:
        A list of dicts with the keys '歌曲', '用户', '内容', '时间', '点赞数',
        '类型' and '关联评论', or ``None`` when locating the elements fails.

    Raises:
        IndexError: If a comment's text does not contain its timestamp, or
            the element lists have different lengths (unchanged from the
            original behaviour).
    """
    driver.get(url)
    sleep(1)
    driver.switch_to.frame('g_iframe')
    try:
        client = driver.find_elements_by_xpath(
            '/html/body/div[3]/div[1]/div/div/div[2]/div/div[2]/div[2]/div/div[2]/div[1]/div/a'
        )
        # Named ``times`` so the ``time`` module name is not shadowed.
        times = driver.find_elements_by_xpath(
            '/html/body/div[3]/div[1]/div/div/div[2]/div/div[2]/div[2]/div/div[2]/div[@class="rp"]/div'
        )
        thumbs = driver.find_elements_by_xpath(
            '/html/body/div[3]/div[1]/div/div/div[2]/div/div[2]/div[2]/div/div[2]/div[@class="rp"]/a[1]'
        )
        text = driver.find_elements_by_xpath(
            '/html/body/div[3]/div[1]/div/div/div[2]/div/div[2]/div[2]/div/div[2]'
        )
    except Exception:
        print('error')
        return None
    print('正在爬取:' + song)
    li = []
    for n, i in enumerate(times[:15]):
        # The like count is shown in parentheses; absent means zero.
        thumb_match = re.search(r'[(](.*)[)]', thumbs[n].text)
        thumb = thumb_match.group(1) if thumb_match else '0'
        if '万' in thumb:
            # "万" is a unit of ten thousand.
            thumb = int(float(thumb.split('万')[0]) * 10000)
        else:
            thumb = int(thumb)

        posted = i.text
        # The timestamp is literal text, so it must be escaped before it is
        # embedded in the pattern.
        content = ILLEGAL_CHARACTERS_RE.sub(
            r'',
            re.findall(r':(.*)' + re.escape(posted),
                       text[n].get_attribute('textContent'))[0])
        comment = {
            '歌曲': song,
            '用户': client[n].text,
            '内容': content,
            '时间': posted,
            '点赞数': thumb,
            '类型': '评论',
            '关联评论': '无'
        }
        # "◆◆" separates a reply from the comment it refers to.
        if '◆◆' in content:
            tu = re.findall(r'(.*)◆◆(.*)', content)[0]
            comment['内容'] = tu[0]
            comment['类型'] = '回复'
            comment['关联评论'] = tu[1]
        li.append(comment)
    return li
Пример #15
0
    def __init__(self,
                 file_path,
                 tokenizer: BertTokenizer,
                 max_length=512,
                 device=None):
        """Load a tab-separated ``label<TAB>text`` file into tensors.

        Every line is stripped and split on tabs.  Field 0 is mapped through
        ``news_type2id_dict``; field 1 is cleaned with
        ``ILLEGAL_CHARACTERS_RE`` and encoded to ``max_length`` padded token
        ids.  The resulting tensors are stored on ``self.label``,
        ``self.token_ids``, ``self.seq_type_ids`` and ``self.atten_masks``
        and moved to ``device`` when one is given.
        """
        labels, contents, masks, segments = [], [], [], []
        with open(file_path, mode='r', encoding='utf8') as f:
            for raw in tqdm(f.readlines()):
                fields = raw.strip().split('\t')

                labels.append(news_type2id_dict[fields[0]])
                ids = tokenizer.encode(
                    ILLEGAL_CHARACTERS_RE.sub(r'', fields[1]),
                    max_length=max_length,
                    pad_to_max_length=True)
                contents.append(ids)
                masks.append(get_atten_mask(ids))
                # Segment ids come from the token ids without first/last entry.
                segments.append(
                    tokenizer.create_token_type_ids_from_sequences(
                        token_ids_0=ids[1:-1]))

        # Labels get an extra trailing dimension via unsqueeze(1).
        self.label = torch.from_numpy(np.array(labels)).unsqueeze(1).long()
        self.token_ids = torch.from_numpy(np.array(contents)).long()
        self.seq_type_ids = torch.from_numpy(np.array(segments)).long()
        self.atten_masks = torch.from_numpy(np.array(masks)).long()
        if device is not None:
            for attr in ('label', 'token_ids', 'seq_type_ids', 'atten_masks'):
                setattr(self, attr, getattr(self, attr).to(device))
def writeExcel():
    """Fill rows 1-69999, columns 1-3 of a new sheet and save the workbook.

    Every cell of a row receives the text of ``row * 2`` after characters
    matched by ``ILLEGAL_CHARACTERS_RE`` have been removed.  The workbook is
    saved to ``D:\\test2.xlsx``.
    """
    outwb = openpyxl.Workbook()  # workbook that will be written
    outws = outwb.create_sheet(index=0)  # sheet inside that workbook
    for row in range(1, 70000):
        # ``sub`` (not ``search``) removes the characters; the former
        # ``search("", text)`` passed the text as the ``pos`` argument,
        # which must be an integer, and therefore raised TypeError.
        cell_text = ILLEGAL_CHARACTERS_RE.sub("", "{}".format(row * 2))
        for col in range(1, 4):
            outws.cell(row, col).value = cell_text
        print(row)
    saveExcel = "D:\\test2.xlsx"
    outwb.save(saveExcel)  # the workbook must be saved explicitly
Пример #17
0
def sanitize_value(value):
    """Return ``value`` made safe for a spreadsheet cell.

    Falsy values are returned unchanged.  Anything else is converted to
    ``str``, stripped of ``ILLEGAL_CHARACTERS_RE`` matches, and prefixed
    with ``'`` if it starts with one of ``ESCAPE_CHARS``.
    """
    if not value:
        return value
    # Remove the illegal characters so writing the cell does not crash.
    text = ILLEGAL_CHARACTERS_RE.sub("", str(value))
    if text.startswith(ESCAPE_CHARS):
        # Prepend ' when the value starts with a possibly malicious char.
        return "'" + text
    return text
Пример #18
0
def news_data_process(file_path):
    """Read a tab-separated ``label<TAB>text`` file.

    Returns:
        A pair ``(labels, texts)`` of equally long lists.  Each text has the
        characters matched by ``ILLEGAL_CHARACTERS_RE`` removed.
    """
    labels, texts = [], []
    with open(file_path, mode='r', encoding='utf8') as f:
        for raw in f:
            fields = raw.strip().split('\t')
            labels.append(fields[0])
            texts.append(ILLEGAL_CHARACTERS_RE.sub(r'', fields[1]))
    return labels, texts
Пример #19
0
 def write_row(self, row):
     """Write ``row`` into the next worksheet row of ``self.ws``.

     ``self.row_count`` is advanced first; the values are then written to
     columns 1, 2, ... with ``ILLEGAL_CHARACTERS_RE`` matches removed.
     """
     self.row_count += 1
     for column_index, raw_value in enumerate(row, start=1):
         # Strip the illegal characters before the value reaches the cell.
         self.ws.cell(row=self.row_count,
                      column=column_index,
                      value=ILLEGAL_CHARACTERS_RE.sub(r'', raw_value))
Пример #20
0
    def _export_to_xlsx(self, cnx):
        """Run ``self.sql`` on ``cnx`` and write the rows to ``self.tmp_file`` as XLSX.

        A write-only workbook is used to keep memory usage low.  Rows are
        read one at a time (pymysql ``SSCursor``, or a streaming cursor when
        ``self.clickhouse`` is set), string values are stripped of
        ``ILLEGAL_CHARACTERS_RE`` matches and every value is passed through
        ``self.correct_int_row``.  The row count is stored in
        ``self.result['affected_rows']``.
        """
        wb = Workbook(write_only=True)
        wb.encoding = self.encoding
        ws = wb.create_sheet()
        ws.title = self.title

        # Push a progress message.
        msg = f'正在执行导出SQL: {self.sql} \n'
        self.pm.pull(msg=msg)
        self.execute_log.append(msg)

        # Read through a cursor so large results do not cause an OOM.
        if not self.clickhouse:
            cnx.cursorclass = pymysql.cursors.SSCursor
        with cnx.cursor() as cursor:
            if self.clickhouse:
                cursor.set_stream_results(True, 1000)
            cursor.execute(self.sql)
            # Push a progress message.
            msg = '正在处理并生成XLSX数据 \n'
            self.pm.pull(msg=msg)
            self.execute_log.append(msg)
            # Header.
            if self.clickhouse:
                header = handle_duplicate_column(
                    [x.name for x in cursor.description])
            else:
                header = [x[0] for x in cursor.description]
            ws.append(header)

            # Row count; set here so the key exists even if reading fails.
            if self.clickhouse:
                self.result['affected_rows'] = cursor.rowcount
            else:
                self.result['affected_rows'] = cursor.rownumber
            # Process the data.
            while True:
                row = cursor.fetchone()
                if not row:
                    break

                # Filter out the special characters.
                filtered = [
                    ILLEGAL_CHARACTERS_RE.sub(r'', x)
                    if isinstance(x, str) else x for x in row
                ]
                # Handle scientific notation.
                ws.append([self.correct_int_row(x) for x in filtered])

            if not self.clickhouse:
                # rownumber only advances as rows are fetched, so the real
                # row count is known only after the loop has finished.
                self.result['affected_rows'] = cursor.rownumber

        # Save to file.
        wb.save(self.tmp_file)
def main(wb, session, OrderModel, WebsiteModel):
    """Crawl project listings from codemart.com into a sheet and the database.

    Pages 10 down to 1 of the project API are fetched with ``get_one_page``.
    Projects published before the module-level ``time_point`` are skipped.
    Every remaining project is stored or updated through ``session``, and
    projects whose status is '招募中' are appended to the worksheet.

    Args:
        wb: Workbook containing a sheet named 'Sheet' (renamed to '码市').
        session: ORM session used for queries, inserts and commits.
        OrderModel: Model class for orders; looked up by the 'cm-<id>' key.
        WebsiteModel: Model class for websites; the row with key 1 is
            attached to newly created orders.
    """
    print('开始爬取码市订单')
    sheet = wb['Sheet']
    sheet.title = '码市'
    sheet.append(['单据编号', '订单描述', '链接', '发布时间', '联系方式', '分配人员'])
    # Running number written to the first column of every appended row.
    count = 1
    website = session.query(WebsiteModel).get(1)
    # Walk the pages from 10 down to 1.
    for i in range(10, 0, -1):
        url = 'https://codemart.com/api/project?page=%d' % i
        result = get_one_page(url)
        # A list is treated as a page of projects, a tuple as an error report.
        if isinstance(result, list):
            for r in result:
                # pubtime is divided by 1000 before conversion
                # (presumably milliseconds -> seconds; confirm with the API).
                time_stamp = int(r['pubtime']) / 1000
                publish_time = datetime.fromtimestamp(time_stamp)
                if publish_time < time_point:
                    continue
                desc = ILLEGAL_CHARACTERS_RE.sub(r'', r['description'])
                cid = 'cm-{}'.format(r['id'])
                contact = get_contact(desc)
                link = 'https://codemart.com/project/{}'.format(r['id'])
                is_valid = True if r['status'] == '招募中' else False
                order_query = session.query(OrderModel).get(cid)
                if order_query:
                    # Existing order: remember the previous validity, then
                    # overwrite it with the current one.
                    is_valided = order_query.is_valid
                    order_query.is_valid = is_valid
                    if is_valid == True:
                        sheet.append(
                            [count, desc, link, publish_time, contact, ''])
                        count += 1
                        # Became valid again: clear the delete flag.
                        if is_valided == False:
                            order_query.is_delete = False
                    # Was valid and no longer is: set the delete flag.
                    if is_valided == True and is_valid == False:
                        order_query.is_delete = True
                else:
                    # New order: create it and attach the website row.
                    order = OrderModel(id=cid,
                                       desc=desc,
                                       link=link,
                                       contact=contact,
                                       category=r['cate'],
                                       pub_time=publish_time,
                                       is_valid=is_valid,
                                       is_delete=False if is_valid else True)
                    order.website = website
                    session.add(order)
                    if is_valid == True:
                        sheet.append(
                            [count, desc, link, publish_time, contact, ''])
                        count += 1
            # Commit once per page.
            session.commit()
        elif isinstance(result, tuple):
            # The two tuple items are formatted into the error message.
            message = '码市爬取第%d行出错:%s' % (result[0], result[1])
            print(message)
            send_message(message)
    print('结束爬取码市订单')
Пример #22
0
def writeFile(data, file):
    """Write the ``faddr`` value of every item in ``data`` to an XLSX file.

    The active sheet is titled 'Sheet1' and starts with the single header
    cell "fadddr".  Each following row holds one ``item.get("faddr")`` value
    with ``ILLEGAL_CHARACTERS_RE`` matches removed.  The workbook is saved
    to ``file``.
    """
    # A fresh workbook; nothing is on disk until save() is called.
    workbook = Workbook()
    worksheet = workbook.active
    worksheet.title = 'Sheet1'
    worksheet.append(["fadddr"])
    for record in data:
        address = ILLEGAL_CHARACTERS_RE.sub(r'', record.get("faddr"))
        worksheet.append([address])
    workbook.save(file)
Пример #23
0
def write_content(path, ranking_sheet_name, ranking_content,
                  reviews_sheet_names, reviews_content):
    """Write ranking and review tables into the workbook at ``path``.

    The workbook is loaded from ``path`` when that succeeds, otherwise a new
    one is created.  ``ranking_content`` goes to the sheet named after
    ``ranking_sheet_name``; every non-``None`` entry of ``reviews_content``
    goes to the sheet named after the matching ``reviews_sheet_names``
    entry.  Sheet names pass through ``format_sheet_name`` and all values
    are written as strings with ``ILLEGAL_CHARACTERS_RE`` matches removed.

    Raises:
        Exception: Whatever ``workbook.save`` raises, after "Write fail"
            has been printed.
    """

    def get_or_create_sheet(workbook, raw_name):
        # Reuse an existing sheet of that name, otherwise add a new one.
        sheet_name = format_sheet_name(raw_name)
        try:
            return workbook[sheet_name]
        except KeyError:
            return workbook.create_sheet(sheet_name)

    def fill_sheet(sheet, rows):
        # Cells are addressed from 1, hence start=1 on both axes.
        for i, row_values in enumerate(rows, start=1):
            for j, value in enumerate(row_values, start=1):
                sheet.cell(row=i,
                           column=j,
                           value=ILLEGAL_CHARACTERS_RE.sub(r'', str(value)))

    try:
        workbook = openpyxl.load_workbook(path)
    except Exception:
        # Best effort: a missing or unreadable file means "start fresh".
        workbook = openpyxl.Workbook()

    fill_sheet(get_or_create_sheet(workbook, ranking_sheet_name),
               ranking_content)

    for index, values in enumerate(reviews_content):
        if values is None:
            continue
        fill_sheet(get_or_create_sheet(workbook, reviews_sheet_names[index]),
                   values)

    # Drop the default first sheet, but never the only sheet in the workbook.
    names = workbook.sheetnames
    if len(names) > 1 and names[0] == 'Sheet':
        workbook.remove(workbook[names[0]])

    try:
        workbook.save(path)
    except Exception:
        print(path, "Write fail")
        raise
    finally:
        workbook.close()
    # Report success only once the file has actually been written.
    print(path, "Write success")
Пример #24
0
 def _create_examples(self, path):
     """Build one ``InputExample`` per line of the tab-separated file at ``path``.

     The zero-based line number becomes ``guid``, field 0 the ``label`` and
     field 1 (with ``ILLEGAL_CHARACTERS_RE`` matches removed) ``text_a``.
     """
     examples = []
     with open(path, mode='r', encoding='utf8') as f:
         for guid, raw in tqdm(enumerate(f.readlines())):
             fields = raw.strip().split('\t')
             label, text = fields[0], fields[1]
             examples.append(
                 InputExample(guid=guid,
                              text_a=ILLEGAL_CHARACTERS_RE.sub(r'', text),
                              label=label))
     return examples
Пример #25
0
def crawler():
    """Drain the ``work`` queue and append one row per movie to ``movie_excel``.

    Each URL is fetched and parsed; every ``li`` inside the element with id
    'asyncRatingRegion' yields ``[number, title, director, actor, brief]``.
    Director, actor and brief fall back to '无' when they cannot be read.
    """

    def cleaned_or_default(pick):
        # Some movies lack a director, actor or brief, so any failure while
        # reading the element falls back to the placeholder.
        try:
            return ILLEGAL_CHARACTERS_RE.sub(r'', pick().text)
        except Exception:
            return '无'

    while not work.empty():  # while the queue is not empty
        url = work.get_nowait()
        res = requests.get(url, headers=headers)
        soup = BeautifulSoup(res.text, 'html.parser')

        region = soup.find(id='asyncRatingRegion')
        for movie in region.find_all('li'):
            name = movie.find('a')
            num = movie.find('div')
            info = movie.find_all('p')

            print('No.' + num.text)
            print('电影名' + name['title'])

            dir_ = cleaned_or_default(lambda: info[0].find('a'))
            actor_ = cleaned_or_default(lambda: info[1].find('a'))
            brief_ = cleaned_or_default(lambda: info[2])

            movie_excel.append([num.text, name['title'], dir_, actor_, brief_])

            print('-------------\n')
Пример #26
0
def insert(result_list, title):
    """Save ``result_list`` as ``<app_path()>\\<title>.xlsx``.

    The first row holds the four captions.  Every following row is one
    result whose items are converted to ``str`` and stripped of
    ``ILLEGAL_CHARACTERS_RE`` matches; the last item is converted back to
    ``int``.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active

    for cell_ref, caption in (('A1', '分类'), ('B1', '地址'), ('C1', '标题'),
                              ('D1', '数量')):
        sheet[cell_ref] = caption

    for result in result_list:
        cells = [ILLEGAL_CHARACTERS_RE.sub(r'', str(item)) for item in result]
        cells[-1] = int(cells[-1])
        sheet.append(cells)

    workbook.save(app_path() + '\\' + title + '.xlsx')
Пример #27
0
    def _exprt_to_csv(self, cnx):
        """Run ``self.sql`` on ``cnx`` and write the rows to ``self.tmp_file`` as CSV.

        Rows are read one at a time (pymysql ``SSDictCursor``, or a
        streaming cursor when ``self.clickhouse`` is set), string values are
        stripped of ``ILLEGAL_CHARACTERS_RE`` matches, and the row count is
        stored in ``self.result['affected_rows']``.  Progress messages go to
        ``self.pm.pull`` and ``self.execute_log``.
        """
        # Push a progress message.
        msg = f'正在执行导出SQL: {self.sql} \n'
        self.pm.pull(msg=msg)
        self.execute_log.append(msg)

        # Open the CSV file.
        with open(self.tmp_file, 'w', newline='', encoding='utf-8') as csvfile:
            # Read through a cursor so large results do not cause an OOM.
            if not self.clickhouse:
                cnx.cursorclass = pymysql.cursors.SSDictCursor
            with cnx.cursor() as cursor:
                if self.clickhouse:
                    cursor.set_stream_results(True, 1000)
                cursor.execute(self.sql)
                # Push a progress message.
                msg = '正在处理并生成CSV数据 \n'
                self.pm.pull(msg=msg)
                self.execute_log.append(msg)
                # Header.
                if self.clickhouse:
                    header = handle_duplicate_column(
                        [x.name for x in cursor.description])
                else:
                    header = [x[0] for x in cursor.description]
                # Row count; set here so the key exists even if reading fails.
                if self.clickhouse:
                    self.result['affected_rows'] = cursor.rowcount
                else:
                    self.result['affected_rows'] = cursor.rownumber

                # Create the CSV writer.
                writer = csv.DictWriter(csvfile, fieldnames=header)
                writer.writeheader()
                # Process the data.
                while True:
                    row = cursor.fetchone()
                    if not row:
                        break
                    if self.clickhouse:
                        # The clickhouse streaming cursor does not return
                        # dicts, so build one from the header.
                        row = dict(zip(header, row))
                    # Filter out the special characters.
                    for k, v in row.items():
                        if isinstance(v, str):
                            row[k] = ILLEGAL_CHARACTERS_RE.sub(r'', v)
                    writer.writerow(row)

                if not self.clickhouse:
                    # rownumber only advances as rows are fetched, so the
                    # real row count is known only after the loop finished.
                    self.result['affected_rows'] = cursor.rownumber
Пример #28
0
 def write_sheet(self, sheet_name, sheet):
     """Add ``sheet`` to the workbook as a worksheet.

     The worksheet title is ``self.sheet_prefix + sheet_name``.  The first
     row holds the headings obtained from ``list(sheet)``; one row follows
     for every entry of ``sheet.lines``.  Text values are stripped of
     ``ILLEGAL_CHARACTERS_RE`` matches, and a ``DataErrorWarning`` is issued
     whenever that changes the value.
     """
     columns = list(sheet)
     worksheet = self.workbook.create_sheet()
     worksheet.title = self.sheet_prefix + sheet_name
     worksheet.append(columns)
     for record in sheet.lines:
         cells = []
         for column in columns:
             cell = record.get(column)
             if isinstance(cell, six.text_type):
                 stripped = ILLEGAL_CHARACTERS_RE.sub('', cell)
                 if stripped != cell:
                     warn("Character(s) in '{}' are not allowed in a spreadsheet cell. Those character(s) will be removed".format(cell),
                          DataErrorWarning)
                 cell = stripped
             cells.append(cell)
         worksheet.append(cells)
Пример #29
0
def parseHtml(html):
    """Decode a JSON response body and append its user records to ``sjjy.csv``.

    Backslashes and every match of ``ILLEGAL_CHARACTERS_RE`` are removed from
    ``html`` before it is decoded as JSON in non-strict mode. One CSV row is
    written per entry of the ``userInfo`` list, and the module-level
    ``line_index`` counter is incremented once per entry.
    """
    global line_index

    cleaned = ILLEGAL_CHARACTERS_RE.sub(r'', html.replace('\\', ''))
    payload = json.loads(cleaned, strict=False)

    # Fields copied verbatim, in output column order; 'shortnote' follows them.
    plain_fields = ('uid', 'nickname', 'age', 'work_location', 'height',
                    'education', 'matchCondition', 'marriage')

    records = []
    for user in payload['userInfo']:
        line_index += 1
        # Newlines inside the short note are flattened to spaces.
        records.append(tuple(user[field] for field in plain_fields)
                       + (user['shortnote'].replace('\n', ' '),))

    with open('sjjy.csv', 'a', newline='') as out:
        csv.writer(out).writerows(records)
Пример #30
0
def parse_pkg_html(file, app_infos):
    """Parse an HTML table of applications and append them to ``app_infos``.

    Args:
        file: Path of the HTML file; it is opened as UTF-8 text.
        app_infos: List that receives one ``AppInfo`` for every table row
            that has more than one ``<td>`` cell. It is mutated in place.

    Nothing is appended when the document has no ``<table>`` or the table
    has no ``<tbody>``.
    """
    # Parse inside a ``with`` block so the file handle is closed once the
    # document has been read, instead of being left open.
    with open(file, encoding='utf-8') as markup:
        # html.parser is the parser backend; lxml could be used as well.
        soup = BeautifulSoup(markup,
                             features='html.parser',
                             from_encoding='UTF-8')

    table = soup.table
    if table is None:
        return
    tbody = table.tbody
    if tbody is None:
        return

    for tr in tbody.find_all('tr'):
        tds = tr.find_all('td')
        if len(tds) <= 1:
            continue

        # The app id is the text between the first '">' and the first '<br'
        # in the serialized markup of the first cell.
        app_id = str(tds[0]).split('<br', 1)[0].split('">', 1)[1].strip()
        uid = int(str(tds[1].a.string).strip())
        name = str(tds[2].a.contents[2].replace('[', '')).strip()
        domain = ILLEGAL_CHARACTERS_RE.sub(r'', tds[2].a.attrs['href'].strip())
        create_time = str(tds[3].string).strip()

        # When the cell has no direct string, the text is read from its
        # <font> child instead.
        end_cell = tds[4]
        if end_cell.string is None:
            end_time = str(end_cell.font.string).strip()
        else:
            end_time = str(end_cell.string).strip()

        # The delivery cell may have no link at all.
        delivery_link = tds[5].a
        delivery = '' if delivery_link is None else str(delivery_link.string).strip()

        app_infos.append(AppInfo(app_id=app_id,
                                 uid=uid,
                                 name=name,
                                 domain=domain,
                                 create_time=create_time,
                                 end_time=end_time,
                                 delivery=delivery))
Пример #31
0
    def export_xlsx(self):
        """Export the result of ``self.sql`` to an xlsx workbook at ``self.file``.

        The query is executed once through an unbuffered ``SSCursor``. The
        column names from ``cursor.description`` form the header row; every
        fetched row is then appended with ``ILLEGAL_CHARACTERS_RE`` matches
        removed from its string values. Finally the workbook is saved and
        ``self.compress_file()`` is called.

        A progress message is appended to ``self.execute_log`` and sent to
        the ``self.user`` channel group before the query runs.
        """
        wb = Workbook()
        wb.encoding = f'{self.encoding}'
        ws = wb.active
        ws.title = self.title

        # SSCursor is an unbuffered cursor, mainly useful for queries that
        # return a lot of data or for connections to remote servers over a
        # slow network.
        self.conn.cursorclass = pymysql.cursors.SSCursor
        with self.conn.cursor() as cursor:
            msg = f'正在导出SQL:{self.sql}\n'
            self.execute_log.append(msg)
            pull_msg = {'status': 3, 'data': msg}
            async_to_sync(channel_layer.group_send)(self.user, {"type": "user.message",
                                                                'text': json.dumps(pull_msg)})
            cursor.execute(self.sql)

            # Take the header from the result metadata rather than from the
            # keys of a first fetched row: this also works when the result
            # set is empty and avoids running the query a second time
            # through a buffered cursor.
            ws.append([column[0] for column in cursor.description or ()])

            while True:
                row = cursor.fetchone()
                if not row:
                    break
                # Strip characters that are not allowed in a spreadsheet cell.
                ws.append([
                    ILLEGAL_CHARACTERS_RE.sub(r'', cell) if isinstance(cell, str) else cell
                    for cell in row
                ])
        wb.save(self.file)
        self.compress_file()
Пример #32
0
def sanitize(val):
    """Return ``val`` with every ``ILLEGAL_CHARACTERS_RE`` match removed.

    Values that are not ``text_type`` instances are returned unchanged.
    """
    if isinstance(val, text_type):
        return ILLEGAL_CHARACTERS_RE.sub("", val)
    return val