Example #1
def seg_train(path=config.train_dev_path):
        wb = load_workbook(path)
        ws = wb['sheet1']
        max_row = ws.max_row
        indexs = list(range(2,max_row+1))
        shuffle(indexs)
        wb_train = Workbook()
        sheet_train = wb_train.create_sheet('sheet1')
        wb_train.remove(wb_train['Sheet'])
        wb_dev = Workbook()
        sheet_dev = wb_dev.create_sheet('sheet1')
        wb_dev.remove(wb_dev['Sheet'])
        for i in range(4):
            sheet_train.cell(1,i+1,ws.cell(1,i+1).value)
            sheet_dev.cell(1,i+1,ws.cell(1,i+1).value)
        mid = len(indexs) // 10 * 8
        train_line = 2
        test_line = 2
        for i in range(len(indexs)):
            if i<mid:
                sheet_train.cell(train_line,1,ws.cell(indexs[i],1).value)
                sheet_train.cell(train_line,2,ws.cell(indexs[i],2).value)
                sheet_train.cell(train_line,3,ws.cell(indexs[i],3).value)
                sheet_train.cell(train_line,4,ws.cell(indexs[i],4).value)
                train_line+=1
            else:
                sheet_dev.cell(test_line,1,ws.cell(indexs[i],1).value)
                sheet_dev.cell(test_line,2,ws.cell(indexs[i],2).value)
                sheet_dev.cell(test_line,3,ws.cell(indexs[i],3).value)
                sheet_dev.cell(test_line,4,ws.cell(indexs[i],4).value)
                test_line+=1
        wb_train.save('./sub_train.xlsx')
        wb_dev.save('./sub_dev.xlsx')
        logger.info("Finished seg train data")
Example #2
def get_all_vocab():
    vocab_train_path = './task2_vocab.txt'
    vocab_val_path = './task2_vocab.val.txt'
    train_data_path = './task2_train_reformat.xlsx'
    all_vocab_path = './all_vocab.txt'
    vocab_list = []
    for path in [vocab_train_path, vocab_val_path]:
        with open(path, 'r', encoding='utf-8') as vocab:
            for x in vocab.readlines():
                vocab_list.append(x[:-1].replace('_x0004_', '').replace(' ', ''))
    wb = load_workbook(train_data_path)
    ws = wb['sheet1']
    max_row = ws.max_row
    for i in range(max_row-1):
        line = i+2
        if ws.cell(line, 2).value is not None:
            places = ws.cell(line, 2).value.split(',')
            for place in places:
                vocab_list.append(place.replace('_x0004_', '').replace(' ', ''))
        if ws.cell(line, 4).value is not None:
            places = ws.cell(line, 4).value.split(',')
            for place in places:
                vocab_list.append(place.replace('_x0004_', '').replace(' ', ''))
    vocab_list = list(set(vocab_list))
    with open(all_vocab_path, 'w', encoding='utf-8') as all_vocab_file:
        for place in vocab_list:
            all_vocab_file.write(place + '\n')
    logger.info('Finished writing all_vocab')
Example #3
async def run_api_server():
    start_http_server(8001)
    logger.info("run api_server")
    runner = web.AppRunner(api_server)
    await runner.setup()
    site = web.TCPSite(runner, '0.0.0.0', 8000)
    await site.start()
Example #4
def spider_main(search_list, use_cache=False):
    if use_cache:
        proxy_df = SpiderProxy.read_csv()
    else:
        pxy = SpiderProxy()
        # pxy.spider_proxy360()
        pxy.spider_xicidaili()
        pxy.check_proxy()
        pxy.save_csv()
        proxy_df = pxy.proxy_df

    if len(proxy_df) <= 0:
        logger.info('No usable proxies left, aborting')
        return
    """
        由于代理太慢暂时每次只启动三个进程
    """
    n_jobs = PROCESS_MAX_COUNT
    if g_enable_debug:
        n_jobs = 1
    parallel = Parallel(n_jobs=n_jobs, verbose=0, pre_dispatch='2*n_jobs')

    parallel(
        delayed(do_spider_parallel)(proxy_df, ind, search_name)
        for ind, search_name in enumerate(search_list))
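
The Parallel/delayed call above is the standard joblib fan-out pattern; a minimal self-contained sketch (the crawl_one worker and its inputs are illustrative):

from joblib import Parallel, delayed

def crawl_one(index, keyword):
    # stand-in for the real per-keyword spider work
    return index, keyword.upper()

results = Parallel(n_jobs=3, pre_dispatch='2*n_jobs')(
    delayed(crawl_one)(i, kw) for i, kw in enumerate(['cat', 'dog']))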
Example #5
def sub_text_more(file='train'):
    if file == 'train':
        path = './sub_train.xlsx'
        save_path = './sub_cut_train.xlsx'
    else:
        path = './sub_dev.xlsx'
        save_path = './sub_cut_dev.xlsx'
    wb = load_workbook(path)
    ws = wb['sheet1']
    max_row = ws.max_row
    wb1 = Workbook()
    ws1 = wb1.create_sheet('sheet1')
    wb1.remove(wb1['Sheet'])
    names = ['原文', '原发部位', '病灶大小', '转移部位']
    for i in range(len(names)):
        ws1.cell(1, i + 1, names[i])
    all_text = []
    all_origin = []
    all_size = []
    all_trans = []
    for i in range(max_row-1):
        line = i+2
        text = ws.cell(line,1).value
        texts = tool.split_text(text)
        all_text.extend(texts)
        for j in range(3):
            if ws.cell(line,j+2).value is not None:
                places = ws.cell(line,j+2).value.split(',')
                for t in texts:
                    place_in_text = []
                    for place in places:
                        if place in t:
                            place_in_text.append(place)
                    if j==0:
                        all_origin.append(','.join(place_in_text))
                    elif j==1:
                        all_size.append(','.join(place_in_text))
                    else:
                        all_trans.append(','.join(place_in_text))
            else:
                for t in texts:
                    if j==0:
                        all_origin.append('')
                    elif j==1:
                        all_size.append('')
                    else:
                        all_trans.append('')
    assert len(all_trans) == len(all_size) and len(all_trans) == len(all_origin), 'len(all_trans) != len(all_size) or len(all_trans) != len(all_origin)'
    for i in range(len(all_text)):
        line = i+2
        ws1.cell(line,1,all_text[i])
        ws1.cell(line,2,all_origin[i])
        ws1.cell(line,3,all_size[i])
        ws1.cell(line,4,all_trans[i])
    wb1.save(save_path)
    logger.info('Finished cut {}.xlsx'.format(file))
Example #6
    def _do_collect_work(self):
        with ThreadPoolExecutor(max_workers=len(self.back_proxys) *
                                3) as executor:
            """
                这里使用线程池还是因为代理的质量太差了, 要控制线程数量
            """
            thread_lock = threading.RLock()
            all_same_cnt = 0
            while True:
                soup = BeautifulSoup(self.driver.page_source, "lxml")
                img_objs = soup.select('#imgid > div > ul > li[data-objurl]')

                sub_same_cnt = 0
                for img in img_objs:
                    url = img['data-objurl']
                    url_thumb = img['data-thumburl']
                    if self.requested_url.count(url) > 0:
                        sub_same_cnt += 1
                        continue

                    url_dict = {'url': url, 'url_thumb': url_thumb}
                    if g_enable_debug:
                        self.down_load_img(url_dict, thread_lock)
                    else:
                        executor.submit(self.down_load_img, url_dict,
                                        thread_lock)

                    # Append here so the download worker does not need its own synchronization
                    self.requested_url.append(url)

                js = "window.scrollTo({}, {})".format(
                    self.current_pos, self.current_pos + MOVE_STEPS)
                self.current_pos += MOVE_STEPS
                self.driver.execute_script(js)
                time.sleep(MOVE_SLEEP_TIME)
                """
                    所有都在requested中记录全等一次否则重置
                """
                if sub_same_cnt == len(img_objs):
                    all_same_cnt += 1
                else:
                    all_same_cnt = 0
                """
                    达到一定次数,认为到底部了
                """
                if all_same_cnt > 30:
                    print '[Process] 图片流 结束,已下载图片数目=%d' % (self.collect_cnt)
                    break

                if self.collect_cnt >= IMAGE_MAX_COUNT:
                    logger.info('collect_cnt > %d task end' %
                                (IMAGE_MAX_COUNT))

                    print('[Process] number of downloaded images exceeded %d' % IMAGE_MAX_COUNT)
                    break
Example #7
def sub_text(file='train'):
    if file == 'train':
        path = './sub_train.xlsx'
        save_path = './sub_cut_train.xlsx'
    else:
        path = './sub_dev.xlsx'
        save_path = './sub_cut_dev.xlsx'
    wb = load_workbook(path)
    ws = wb['sheet1']
    max_row = ws.max_row
    wb1 = Workbook()
    wb2 = Workbook()
    ws1 = wb1.create_sheet('sheet1')
    wb1.remove(wb1['Sheet'])
    ws2 = wb2.create_sheet('sheet1')
    wb2.remove(wb2['Sheet'])
    names = ['原文', '原发部位', '病灶大小', '转移部位']
    for i in range(len(names)):
        ws1.cell(1, i + 1, names[i])
        ws2.cell(1, i + 1, names[i])
    line_1_2 = 2
    for i in tqdm(range(max_row - 1)):
        p = [[] for i in range(6)]
        line = i + 2
        text = ws.cell(line,1).value
        middle = ws.cell(line,1).value.find('。')
        text1 = text[:middle+1]
        text2 = text[middle+1:]
        for j in range(3):
            if ws.cell(line, j + 2).value is not None:
                places = ws.cell(line, j + 2).value.split(',')
                for place in places:
                    if j == 1:
                        p[2 * j].append(place)
                        p[2 * j + 1].append(place)
                    else:
                        if place in text1:
                            p[2 * j].append(place)
                        if place in text2:
                            p[2 * j + 1].append(place)
            else:
                p[2 * j] = ''
                p[2 * j + 1] = ''
        ws1.cell(line_1_2,1,text1)
        ws1.cell(line_1_2+1,1,text2)
        ws1.cell(line_1_2,2,','.join(p[0]))
        ws1.cell(line_1_2+1,2,','.join(p[1]))
        ws1.cell(line_1_2,3,','.join(p[2]))
        ws1.cell(line_1_2+1,3,','.join(p[3]))
        ws1.cell(line_1_2,4,','.join(p[4]))
        ws1.cell(line_1_2+1,4,','.join(p[5]))
        line_1_2+=2
    wb1.save(save_path)
    logger.info('Finished cut {}.xlsx'.format(file))
Example #8
    def do_thread_work(self, proxy, checked_list, thread_lock):
        if proxy['type'] == 'HTTP':
            proxy_dict = dict(http='http://{}'.format(proxy['proxy']),
                              https='http://{}'.format(proxy['proxy']))
        else:
            proxy_dict = dict(http='socks5://{}'.format(proxy['proxy']),
                              https='socks5://{}'.format(proxy['proxy']))

        try:
            # r = requests.post("https://www.baidu.com/", headers=self.headers, proxies=proxy_dict, timeout=15,
            #                   verify=False)
            img_url = 'http://picm.bbzhi.com/dongwubizhi/labuladuoxunhuiquanbizhi/animal_' \
                      'labrador_retriever_1600x1200_44243_m.jpg'

            enable_stream = False
            if enable_stream:
                response = requests.get(img_url,
                                        headers=self.headers,
                                        proxies=proxy_dict,
                                        timeout=15,
                                        stream=True)
                if response.status_code == 200:
                    test_name = '../gen/check_proxy.jpg'
                    with open(test_name, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=1024):
                            if chunk:
                                f.write(chunk)
                                f.flush()

                        check_img = PIL.Image.open(test_name)
                        check_img.close()
            else:
                response = requests.get(img_url,
                                        headers=self.headers,
                                        proxies=proxy_dict,
                                        timeout=(10, 20))
                if response.status_code == 200:
                    test_name = '../gen/check_proxy.jpg'
                    with open(test_name, 'wb') as f:
                        f.write(response.content)
                        f.flush()
                    check_img = PIL.Image.open(test_name)
                    check_img.close()
        except Exception as e:
            # logger.exception(e)
            return
        with thread_lock:
            logger.info('{} check ok'.format(proxy['proxy']))
            checked_list.append(proxy)
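
For reference, the proxies mapping built at the top of do_thread_work is the standard requests format; the hosts and ports below are placeholders:

import requests

http_proxy = {'http': 'http://1.2.3.4:8080', 'https': 'http://1.2.3.4:8080'}
socks_proxy = {'http': 'socks5://1.2.3.4:1080', 'https': 'socks5://1.2.3.4:1080'}  # needs requests[socks]
# r = requests.get('http://example.com', proxies=http_proxy, timeout=10)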
Example #9
async def verify_error_proxy_task():
    logger.info("run verify_error_proxy_task")
    s = sess_maker()
    c = s.query(Proxy).filter(Proxy.status == STATUS_OK).count()
    s.close()
    if c < VERIFY_ERROR_LIMIT:
        await verify_error_proxy()

    s = sess_maker()
    c = s.query(Proxy).filter(Proxy.status == STATUS_ERROR).count()
    if c > MAX_ERROR_PROXIES:
        res = s.query(Proxy).filter(Proxy.status == STATUS_ERROR).order_by(
            asc(Proxy.updated_at)).limit(c -
                                         MAX_ERROR_PROXIES).from_self().all()
        for stale in res:
            s.delete(stale)
    s.commit()
Example #10
    def check_proxy(self):
        checked_list = list()
        thread_lock = threading.RLock()
        thread_array = []
        for proxy in self.proxy_list:
            # self.do_thread_work(proxy, checked_list, thread_lock)
            t = threading.Thread(target=self.do_thread_work,
                                 args=(
                                     proxy,
                                     checked_list,
                                     thread_lock,
                                 ))
            t.daemon = True
            t.start()
            thread_array.append(t)

        for t in thread_array:
            t.join()

        self.proxy_list = checked_list
        logger.info('proxy_list len={}'.format(len(self.proxy_list)))
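
check_proxy starts one thread per proxy, which can mean hundreds of threads at once. A bounded pool gives the same join semantics; this is a sketch under that assumption, not the original implementation:

import threading
from concurrent.futures import ThreadPoolExecutor

def check_all(proxies, worker, max_workers=50):
    # worker takes the same arguments as do_thread_work: (proxy, checked_list, thread_lock)
    checked = []
    lock = threading.RLock()
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(worker, p, checked, lock) for p in proxies]
        for f in futures:
            f.result()  # wait for completion and surface exceptions
    return checked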
Example #11
 def createMenu(self, accessToken):
     logger.info("createMenu in...........")
     redirect_uri1 = 'http://www.dabooster.com/sport/reserve_middleware_wx.html?query=yes'
     redirect_uri2 = 'http://www.dabooster.com/sport/reserve_middleware_wx.html?query=no'
     redirect_uri3 = 'http://www.dabooster.com/sport/personal_center_wx.html'
     url1 = "https://open.weixin.qq.com/connect/oauth2/authorize?appid=" + appid + "&redirect_uri=" + redirect_uri1 + "&response_type=code&scope=snsapi_base&state=OK#wechat_redirect" 
     url2 = "https://open.weixin.qq.com/connect/oauth2/authorize?appid=" + appid + "&redirect_uri=" + redirect_uri2 + "&response_type=code&scope=snsapi_base&state=OK#wechat_redirect" 
     url3 = "https://open.weixin.qq.com/connect/oauth2/authorize?appid=" + appid + "&redirect_uri=" + redirect_uri3 + "&response_type=code&scope=snsapi_base&state=OK#wechat_redirect"
     menu = '''{
              "button":[
                  { 
                      "name":"在线预定",
                      "sub_button":[
                           {"type":"view",
                           "name":"场地预定",
                           "url":"http://www.dabooster.com/sport/reserve_wx.html?appid=wx95198705de430c74"},
                            {"type":"view",
                           "name":"我的订单",
                           "url":"%s"}                              
                      ]
                 },
                 {
                       "type":"view",
                       "name":"活动",
                       "url":"%s"
                 },
                  { 
                       "type":"view",
                       "name":"个人中心",
                       "url":"%s"
                  }
              ]}''' % (url1, url2, url3)
     html = urllib2.urlopen(self.createUrl + accessToken, menu.encode("utf-8"))
     result = json.loads(html.read().decode("utf-8"))
     
     logger.info("html=" + str(result))
     
     return result["errcode"]
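
Building the payload with json.dumps instead of a hand-written format string keeps the brackets balanced automatically; a sketch (the build_menu name and abbreviated content are illustrative):

import json

def build_menu(url1, url2, url3):
    menu = {
        "button": [
            {"name": "在线预定", "sub_button": [
                {"type": "view", "name": "场地预定",
                 "url": "http://www.dabooster.com/sport/reserve_wx.html?appid=wx95198705de430c74"},
                {"type": "view", "name": "我的订单", "url": url1},
            ]},
            {"type": "view", "name": "活动", "url": url2},
            {"type": "view", "name": "个人中心", "url": url3},
        ]
    }
    return json.dumps(menu, ensure_ascii=False)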
Example #12
    def get_data(self):
        db = Database()
        db.connect()
        for tab in self.tab_list:
            # MySQL table names cannot contain '-'
            if '-' in tab:
                temp_tab = tab.replace('-', '_')
            else:
                temp_tab = tab
            table_name = 'etf_' + temp_tab
            url = self.root_url
            first_body = {
                "tab": tab,
                "only": ["meta", "data"],
            }
            resp = requests.post(url,
                                 headers=self.header,
                                 data=json.dumps(first_body)).json()
            # 'meta' summarizes the returned data
            total_pages = resp['meta']['total_pages']
            # write the first page of results into the database
            logger.info("{} has total {} pages!!".format(tab, total_pages))
            logger.info("getting {} page {} data".format(tab, '1'))
            self.spilt_data_by_name_and_insert(resp['data'], db, table_name)

            for i in range(2, total_pages + 1):
                logger.info("getting data of {}--page {} / {}".format(
                    tab, str(i), str(total_pages)))
                payload = {
                    "page": str(i),
                    "tab": tab,
                    "only": ["meta", "data"],
                }
                resp = requests.post(url,
                                     headers=self.header,
                                     data=json.dumps(payload)).json()
                self.spilt_data_by_name_and_insert(resp['data'], db, table_name)
        db.close()
Example #13
 async def wrapper(*args, **kw):
     res = await fn(*args, **kw)
     logger.info(f"run spider {fn.__name__} get {len(res)} result")
     return res
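
This wrapper looks like the inner function of a logging decorator; the enclosing decorator would read roughly as follows (the log_spider_result name and the logger setup are assumptions):

import functools
import logging

logger = logging.getLogger(__name__)

def log_spider_result(fn):
    @functools.wraps(fn)
    async def wrapper(*args, **kw):
        res = await fn(*args, **kw)
        logger.info(f"run spider {fn.__name__} get {len(res)} result")
        return res
    return wrapper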
Example #14
 def train(self):
     max_f1 = -1
     max_dict = {}
     max_report = {}
     label_report = {}
     loss_list = []
     f1_list = []
     epoch_list = []
     if not os.path.exists('./result/classification_report/{}'.format(
             self.config.experiment_name)):
         os.mkdir('./result/classification_report/{}'.format(
             self.config.experiment_name))
         os.mkdir('./result/picture/{}'.format(self.config.experiment_name))
         os.mkdir('./result/data/{}'.format(self.config.experiment_name))
         os.mkdir('./result/data/{}/test_format'.format(
             self.config.experiment_name))
     logger.info('Loading data ...')
     train_data = self.tool.load_data(self.config.train_path,
                                      self.config.is_bioes)
     dev_data = self.tool.load_data(self.config.dev_path,
                                    self.config.is_bioes)
     logger.info('Finished load data')
     logger.info('Building vocab ...')
     if self.config.is_pretrained_model:
         with open(self.config.pretrained_vocab, 'r',
                   encoding='utf-8') as vocab_file:
             vocab_list = vocab_file.readlines()
         if self.config.model_name == 'FLAT':
             self.bigram_vocab = self.tool.get_bigram_vocab(
                 train_data, dev_data)
             self.lattice_vocab = self.tool.get_text_vocab(
                 train_data, dev_data)
         else:
             self.word_vocab = self.tool.get_text_vocab(vocab_list)
     else:
         if self.config.model_name == 'FLAT':
             self.bigram_vocab = self.tool.get_bigram_vocab(
                 train_data, dev_data)
             self.lattice_vocab = self.tool.get_text_vocab(
                 train_data, dev_data)
         else:
             self.word_vocab = self.tool.get_text_vocab(
                 train_data, dev_data)
     if self.config.model_name == 'FLAT':
         vectors = self.lattice_vocab.vectors
     else:
         vectors = self.word_vocab.vectors
     self.tag_vocab = self.tool.get_tag_vocab(train_data, dev_data)
     logger.info('Finished build vocab')
     if self.config.is_hidden_tag:
         self.hidden_tag_vocab = self.tool.get_hidden_tag_vocab(
             train_data, dev_data)
         model = self.init_model(self.config,
                                 len(self.word_vocab),
                                 len(self.tag_vocab),
                                 len(self.hidden_tag_vocab),
                                 vectors=vectors,
                                 n_bigram=None)
     elif self.config.model_name == 'FLAT':
         model = self.init_model(self.config,
                                 len(self.bigram_vocab),
                                 len(self.lattice_vocab),
                                 len(self.tag_vocab),
                                 vectors=vectors,
                                 n_bigram=None)
     else:
         model = self.init_model(self.config,
                                 len(self.word_vocab),
                                 len(self.tag_vocab),
                                 None,
                                 vectors=vectors,
                                 n_bigram=None)
     # model.load_state_dict(torch.load(self.config.model_path.format(self.config.experiment_name)))
     self.model = model
     logger.info('Building iterator ...')
     train_iter = self.tool.get_iterator(train_data,
                                         batch_size=self.config.batch_size)
     dev_iter = self.tool.get_iterator(dev_data,
                                       batch_size=self.config.batch_size)
     logger.info('Finished build iterator')
     optimizer = optim.Adam(model.parameters(),
                            lr=self.config.learning_rate,
                            weight_decay=1e-5)
     logger.info('Begining train ...')
     for epoch in range(self.config.epoch):
         model.train()
         acc_loss = 0
         dice_loss = 0
          for index, batch in enumerate(tqdm(train_iter)):
              if batch.tag.shape[1] == self.config.batch_size:
                  optimizer.zero_grad()
                  if self.config.model_name == 'FLAT':
                      bigram = batch.bigram[0]
                      lattice = batch.lattice[0]
                      lattice_len = batch.lattice[1]
                      tag = batch.tag
                      loss = model.loss(bigram, lattice, lattice_len, tag)
                  else:
                      text = batch.text[0]
                      tag = batch.tag
                      text_len = batch.text[1]
                      if self.config.is_hidden_tag:
                          hidden_tag = batch.hidden_tag
                          loss = model.loss(text, text_len, tag, hidden_tag)
                      else:
                          loss, dice = model.loss(text, text_len, tag)
                          # only this branch returns an auxiliary dice loss
                          dice_loss += dice.view(-1).cpu().data.tolist()[0]
                  acc_loss += loss.view(-1).cpu().data.tolist()[0]
                 loss.backward()
                 optimizer.step()
         f1, report_dict, entity_prf_dict = self.eval(dev_iter)
         loss_list.append(acc_loss)
         # f1 = report_dict['weighted avg']['f1-score']
         f1_list.append(f1)
         epoch_list.append(epoch + 1)
         logger.info('dice_loss:{}'.format(dice_loss))
         logger.info('epoch:{}   loss:{}   weighted avg:{}'.format(
             epoch, acc_loss, report_dict['weighted avg']))
         if f1 > max_f1:
             max_f1 = f1
             label_report = report_dict['weighted avg']
             max_dict = entity_prf_dict['average']
             max_report = entity_prf_dict
             torch.save(
                 model.state_dict(),
                 './save_model/{}.pkl'.format(self.config.experiment_name))
             logger.info(
                 'The best model saved has entity-f1:{}   label-f1:{}'.
                 format(max_f1, label_report['f1-score']))
     logger.info('Finished train')
     logger.info('Max_f1 avg : {}'.format(max_dict))
     # with codecs.open('./result/classification_report/{}/pred_info.txt'.format(config.experiment_name), 'w',
     #                  encoding='utf-8') as f:
     #     f.write(max_dict+ '\n' + label_report)
     self.tool.write_csv(max_report, label_report)
     self.tool.show_1y(epoch_list, loss_list, 'loss')
     self.tool.show_1y(epoch_list, f1_list, 'f1')
Example #15
 def write_val_true_pred(self, path=None, model_name=None, save_path=None):
     if path is None:
         model_name = self.config.model_path.format(
             self.config.experiment_name)
         save_path = self.config.analysis_path.format(
             self.config.experiment_name)
     train_data = self.tool.load_data(self.config.train_path,
                                      self.config.is_bioes)
     dev_data = self.tool.load_data(self.config.dev_path,
                                    self.config.is_bioes)
     logger.info('Finished load data')
     logger.info('Building vocab ...')
     if self.config.is_pretrained_model:
         with open(self.config.pretrained_vocab, 'r',
                   encoding='utf-8') as vocab_file:
             vocab_list = vocab_file.readlines()
         word_vocab = self.tool.get_text_vocab(vocab_list)
     else:
         if self.config.model_name == 'FLAT':
             bigram_vocab = self.tool.get_bigram_vocab(train_data, dev_data)
             lattice_vocab = self.tool.get_text_vocab(train_data, dev_data)
         else:
             word_vocab = self.tool.get_text_vocab(train_data, dev_data)
     # vectors = lattice_vocab.vectors
     vectors = None
     tag_vocab = self.tool.get_tag_vocab(train_data, dev_data)
     logger.info('Finished build vocab')
     if self.config.is_hidden_tag:
         self.hidden_tag_vocab = self.tool.get_hidden_tag_vocab(
             train_data, dev_data)
         model = self.init_model(self.config,
                                 len(word_vocab),
                                 len(tag_vocab),
                                 len(self.hidden_tag_vocab),
                                 vectors=vectors)
     elif self.config.model_name == 'FLAT':
         model = self.init_model(self.config,
                                 len(bigram_vocab),
                                 len(lattice_vocab),
                                 len(tag_vocab),
                                 vectors=vectors,
                                 n_bigram=None)
     else:
         model = self.init_model(self.config,
                                 len(word_vocab),
                                 len(tag_vocab),
                                 None,
                                 vectors=vectors)
     model.load_state_dict(torch.load(model_name))
      # create a new xlsx with seven columns: the original four plus three prediction columns
     wb_analysis = Workbook()
     analysis_sheet = wb_analysis.create_sheet('sheet1')
     wb_analysis.remove(wb_analysis['Sheet'])
     names = [
         '原文', '原发部位', '病灶大小', '转移部位', 'pred_原发部位', 'pred_病灶大小', 'pred_转移部位'
     ]
     for i in range(len(names)):
         analysis_sheet.cell(1, i + 1, names[i])
     wb = load_workbook(filename=config.analysis_dev_path)
     ws = wb['sheet1']
     max_row = ws.max_row
     false_fill = PatternFill(fill_type='solid', fgColor='FFC125')
     for line_num in tqdm(range(max_row - 1)):
         line_num += 2
         sentence = ws.cell(line_num, 1).value
         # index_size = {}
         # chars = ['.', '*', '×', 'X', 'x', 'c', 'C', 'm', 'M']
         # starts = []
         # ends = []
         # i = 0
         # while i < len(sentence):
         #     if sentence[i] in chars or sentence[i].isdigit():
         #         S_start = i
         #         while i + 1 < len(sentence) and (sentence[i + 1] in chars or sentence[i + 1].isdigit()):
         #             i += 1
         #         if sentence[S_start:i + 1].__contains__('M') or sentence[S_start:i + 1].__contains__('m'):
         #             starts.append(S_start)
         #             ends.append(i)
         #         i += 1
         #     else:
         #         i += 1
         # sentence.replace('$', '')
         # new_sentence = [c for c in sentence]
         # width = 0
         # if len(starts) != 0:
         #     for i in range(len(starts)):
         #         start_i = starts[i] - width
         #         index_size[start_i] = sentence[starts[i]:ends[i] + 1]
         #         for j in range(ends[i] - starts[i]):
         #             del new_sentence[start_i]
         #         new_sentence[start_i] = '$'
         #         width += ends[i] - starts[i]
         #         a = 0
         # sentence = ''.join(new_sentence)
         sentence1 = []
         tag_pred = []
         if self.config.model_name == 'FLAT':
             texts = self.tool.split_text(sentence)
             for text in texts:
                 sentence1.extend(text)
                 bigram1 = get_bigram(text)
                 bigram = torch.tensor(
                     numpy.array([bigram_vocab.stoi[bi] for bi in bigram1],
                                 dtype='int64')).unsqueeze(1).expand(
                                     len(bigram1),
                                     self.config.batch_size).to(device)
                 lattice1 = list(text) + w_trie.get_lexicon(text)
                 lattice = torch.tensor(
                     numpy.array(
                         [lattice_vocab.stoi[word] for word in lattice1],
                         dtype='int64')).unsqueeze(1).expand(
                             len(lattice1),
                             self.config.batch_size).to(device)
                 lattice_len = torch.tensor(
                     numpy.array([len(lattice1)], dtype='int64')).expand(
                         self.config.batch_size).to(device)
                 result = model(bigram, lattice, lattice_len)[0]
                 for k in result:
                     tag_pred.append(tag_vocab.itos[k])
         else:
             texts = self.tool.split_text(sentence)
             for text in texts:
                 sentence1.extend(text)
                 text = torch.tensor(
                     numpy.array([word_vocab.stoi[word] for word in text],
                                 dtype='int64')).unsqueeze(1).expand(
                                     len(text),
                                     self.config.batch_size).to(device)
                 text_len = torch.tensor(
                     numpy.array([len(text)], dtype='int64')).expand(
                         self.config.batch_size).to(device)
                 result = model(text, text_len)[0]
                 for k in result:
                     tag_pred.append(tag_vocab.itos[k])
         sentence1 = ''.join(sentence1)
         i = 0
         origin_places = []
         sizes = []
         transfered_places = []
         while i < len(tag_pred):
             if self.config.is_bioes:
                 start = end = 0
                 if tag_pred[i][:1] == 'B':
                     kind = tag_pred[i][2:]
                     start = i
                     end = i
                     while end + 1 < len(sentence1) and (
                             tag_pred[end + 1][0] == 'I'
                             or tag_pred[end + 1][0]
                             == 'E') and tag_pred[end + 1][2:] == kind:
                         end += 1
                     if kind == 'origin_place':
                         origin_places.append(sentence1[start:end + 1])
                     elif kind == 'size':
                         sizes.append(sentence1[start:end + 1])
                     else:
                         transfered_places.append(sentence1[start:end + 1])
                     i = end + 1
                 elif tag_pred[i][:1] == 'E':
                     kind = tag_pred[i][2:]
                     start = i
                     end = i
                     if kind == 'origin_place':
                         origin_places.append(sentence1[start:end + 1])
                     elif kind == 'size':
                         # sizes.append(index_size[start])
                         sizes.append(sentence1[start:end + 1])
                     else:
                         transfered_places.append(sentence1[start:end + 1])
                     i += 1
                 else:
                     i += 1
             else:
                 start = end = 0
                 if tag_pred[i][:1] == 'B':
                     kind = tag_pred[i][2:]
                     start = end = i
                     while end + 1 < len(sentence1) and tag_pred[
                             end + 1][0] == 'I' and tag_pred[end +
                                                             1][2:] == kind:
                         end += 1
                     if kind == 'origin_place':
                         origin_places.append(sentence1[start:end + 1])
                     elif kind == 'size':
                         # sizes.append(index_size[start])
                         sizes.append(sentence1[start:end + 1])
                     else:
                         transfered_places.append(sentence1[start:end + 1])
                     i = end + 1
                 else:
                     i += 1
             # if tag_pred[i]!='O':
             #     start = i
             #     kind = tag_pred[i][2:]
             #     while i+1<len(tag_pred) and tag_pred[i+1][2:]==kind:
             #         i+=1
             #     end = i + 1
             #     if kind == 'origin_place':
             #         origin_places.append(sentence1[start:end])
             #     elif kind == 'size':
             #         sizes.append(index_size[start])
             #     else:
             #         transfered_places.append(sentence1[start:end])
             # i+=1
         for places in [origin_places, sizes, transfered_places]:
             for place in places:
                 if place == []:
                     places.remove(place)
         analysis_sheet.cell(line_num, 1, ws.cell(line_num, 1).value)
         analysis_sheet.cell(line_num, 2, ws.cell(line_num, 2).value)
         analysis_sheet.cell(line_num, 3, ws.cell(line_num, 3).value)
         analysis_sheet.cell(line_num, 4, ws.cell(line_num, 4).value)
         analysis_sheet.cell(line_num, 5,
                             ','.join(list(set(origin_places))))
         analysis_sheet.cell(line_num, 6, ','.join(list(set(sizes))))
         analysis_sheet.cell(line_num, 7,
                             ','.join(list(set(transfered_places))))
         for i in range(2, 5):
              if analysis_sheet.cell(line_num, i).value is not None:
                  if analysis_sheet.cell(line_num, i + 3).value is None:
                     analysis_sheet.cell(line_num, i).fill = false_fill
                     analysis_sheet.cell(line_num, i + 3).fill = false_fill
                 else:
                     flag = False
                     s1 = analysis_sheet.cell(line_num, i).value.split(',')
                     s2 = analysis_sheet.cell(line_num,
                                              i + 3).value.split(',')
                     for x in s2:
                         if x not in s1:
                             flag = True
                             break
                     if flag:
                         analysis_sheet.cell(line_num, i).fill = false_fill
                         analysis_sheet.cell(line_num,
                                             i + 3).fill = false_fill
     wb_analysis.save(save_path)
     logger.info('Finished Predicting...')
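
The tag-decoding loops in this example and in predict_test walk the B-/I-/E- prefixes by hand; reduced to its core, the idea is a plain BIO span decoder, sketched here with simplified tag names:

def decode_bio(chars, tags):
    # collect (kind, text) spans from parallel character/tag sequences
    spans, i = [], 0
    while i < len(tags):
        if tags[i].startswith('B-'):
            kind, start = tags[i][2:], i
            while i + 1 < len(tags) and tags[i + 1] == 'I-' + kind:
                i += 1
            spans.append((kind, ''.join(chars[start:i + 1])))
        i += 1
    return spans

# decode_bio('肝转移', ['B-transfered_place', 'I-transfered_place', 'I-transfered_place'])
# -> [('transfered_place', '肝转移')]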
Example #16
async def verify_ok_proxy_task():
    logger.info("run verify_ok_proxy_task")
    await verifier.verify_ok_proxy()
    await verify_error_proxy_task()
    await update_squid_task()
Example #17
    await verifier.verify_ok_proxy()
    await verify_error_proxy_task()
    await update_squid_task()


@cron_wait
async def fetch_new_proxy_task():
    logger.info("run fetch_new_proxy_task")
    await spider.run_spider()
    await verifier.verify_new_proxy()
    # await verify_error_proxy_task()
    await update_squid_task()


if __name__ == '__main__':
    logger.info("start")

    loop = asyncio.get_event_loop()
    loop.run_until_complete(update_squid_task())

    msh = Scheduler()
    msh.add_job(CronJob().every(10).minute.go(verify_ok_proxy_task))
    msh.add_job(CronJob().every(30).minute.go(fetch_new_proxy_task))
    try:
        loop.run_until_complete(asyncio.wait([
            msh.start(),
            run_api_server(),
        ]))
        loop.run_forever()
    except KeyboardInterrupt:
        print('exit')
Example #18
def data_clean(path='./task2_train_reformat{}.xlsx'):
    wb = load_workbook(path.format(''))
    ws = wb['sheet1']
    max_row = ws.max_row
    wb1 = Workbook()
    ws1 = wb1.create_sheet('sheet1')
    wb1.remove(wb1['Sheet'])
    names = ['原文', '原发部位', '病灶大小', '转移部位']
    for i in range(len(names)):
        ws1.cell(1, i + 1, names[i])
    place_num = 0
    size_num = 0
    for i in range(max_row - 1):
        line = i + 2
        new_sentence = ''
        chars = ['.','*','×','X','x','c','C','m','M',' ']
        # o_chars = ['_x0004_', '�', ':', ',', ';']
        # t_chars = ['', '', ':', ',', ';']
        o_chars = ['�']
        t_chars = ['']
        for i in range(4):
            if i==0:
                if '检测值' in ws.cell(line,i+1).value:
                    new_sentence = ws.cell(line, i + 1).value
                    for i in range(len(o_chars)):
                        new_sentence = new_sentence.replace(o_chars[i], t_chars[i])
                else:
                    new_sentence = ws.cell(line, i+1).value.replace(' ', '')
                    for i in range(len(o_chars)):
                        new_sentence = new_sentence.replace(o_chars[i], t_chars[i])
                i = 0
                j = 0
                while j < len(new_sentence):
                    while j < len(new_sentence) and not new_sentence[j].isdigit():
                        j+=1
                    start = j
                    end = start
                    while end+1<len(new_sentence) and (new_sentence[end+1] in chars or new_sentence[end+1].isdigit()):
                        end+=1
                    if new_sentence[start:end+1].__contains__('m') or new_sentence[start:end+1].__contains__('M'):
                        old_size = new_sentence[start:end+1]
                        new_size = ''
                        nums = ''
                        k=start
                        flag = False
                        while k<=end:
                            while new_sentence[k].isdigit() or new_sentence[k]=='.':
                                nums+=new_sentence[k]
                                k+=1
                                flag=True
                            if flag:
                                nums+=','
                                flag=False
                            k+=1
                        nums = nums[:-1].split(',')
                        if old_size.__contains__('c') or old_size.__contains__('C'):
                            for num in nums:
                                new_size = new_size+num+'CM'+'×'
                            new_size = new_size[:-1]
                        else:
                            for num in nums:
                                new_size = new_size+num+'MM'+'×'
                            new_size = new_size[:-1]
                        j=end
                        new_sentence = new_sentence.replace(old_size,new_size)
                    j+=1
                ws1.cell(line,i+1,new_sentence)
            elif i==1 or i==3:
                places = ws.cell(line,i+1).value
                if places is not None:
                    places = places.replace('_x0004_', '').replace(' ', '')
                    for place in places.split(','):
                        if place not in new_sentence:
                            place_num+=1
                            print('place not found in sentence, count {}  {}  {}'.format(place_num, new_sentence, place))
                    ws1.cell(line, i + 1, places)
                else:
                    ws1.cell(line, i + 1, '')
            else:
                sizes = ws.cell(line,i+1).value
                if sizes is not None:
                    sizes = sizes.replace('_x0004_', '').replace(' ', '').replace('�', '')
                    sizes_clean = ''
                    sizes_list = sizes.split(',')
                    for j in range(len(sizes_list)):
                        size = re.findall(r"\d+\.?\d*",sizes_list[j])
                        if sizes_list[j].__contains__('c') or sizes_list[j].__contains__('C'):
                            for k in range(len(size)):
                                sizes_clean = sizes_clean + size[k] + 'CM' + '×'
                            sizes_clean = sizes_clean[:-1]+','
                        else:
                            for k in range(len(size)):
                                sizes_clean = sizes_clean + size[k] + 'MM' + '×'
                            sizes_clean = sizes_clean[:-1] + ','
                    sizes_clean = sizes_clean[:-1]
                    for size in sizes_clean.split(','):
                        if size not in new_sentence:
                            size_num += 1
                            print('size not found in sentence, count {}  {}  {}'.format(size_num, new_sentence, size))
                    ws1.cell(line, i + 1, sizes_clean)
    wb1.save(path.format('_cleaned'))
    logger.info('Finished cleaning data')
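
The size-normalization branches above rewrite free-form measurements into a uniform '<num>CM×<num>CM' / '<num>MM×<num>MM' form; a worked example of the same regex idea (the normalize_size helper is illustrative only):

import re

def normalize_size(raw):
    nums = re.findall(r"\d+\.?\d*", raw)
    unit = 'CM' if 'c' in raw.lower() else 'MM'
    return '×'.join(num + unit for num in nums)

# normalize_size('约1.5*2.0cm') -> '1.5CM×2.0CM'
# normalize_size('8×6mm')       -> '8MM×6MM'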
Example #19
def sub_text_condition(file='train'):
    if file == 'train':
        path = './sub_train.xlsx'
        save_path = './sub_cut_train1.xlsx'
    else:
        path = './sub_dev.xlsx'
        save_path = './sub_cut_dev1.xlsx'
    wb = load_workbook(path)
    ws = wb['sheet1']
    max_row = ws.max_row
    wb1 = Workbook()
    ws1 = wb1.create_sheet('sheet1')
    wb1.remove(wb1['Sheet'])
    names = ['原文', '原发部位', '病灶大小', '转移部位']
    for i in range(len(names)):
        ws1.cell(1, i + 1, names[i])
    all_text = []
    all_origin = []
    all_size = []
    all_trans = []
    for i in range(max_row-1):
        line = i+2
        text = ws.cell(line,1).value
        texts = tool.split_text(text)
        all_text.extend(texts)
        if ws.cell(line,4).value is not None and text.__contains__('转移'):
            places = ws.cell(line,4).value.split(',')
            for t in texts:
                place_in_text = []
                for place in places:
                    if place in t and t.__contains__('转移'):
                        place_in_text.append(place)
                all_trans.append(','.join(place_in_text))
        elif ws.cell(line,4).value is not None and not text.__contains__('转移'):
            places = ws.cell(line, 4).value.split(',')
            for t in texts:
                place_in_text = []
                for place in places:
                    if place in t :
                        place_in_text.append(place)
                all_trans.append(','.join(place_in_text))
        elif ws.cell(line,4).value is None:
            for t in texts:
                all_trans.append('')
        for j in range(2):
            if ws.cell(line,j+2).value is not None:
                places = ws.cell(line,j+2).value.split(',')
                for t in texts:
                    place_in_text = []
                    for place in places:
                        if place in t:
                            place_in_text.append(place)
                    if j==0:
                        all_origin.append(','.join(place_in_text))
                    elif j==1:
                        all_size.append(','.join(place_in_text))
                    else:
                        all_trans.append(','.join(place_in_text))
            else:
                for t in texts:
                    if j==0:
                        all_origin.append('')
                    elif j==1:
                        all_size.append('')
                    else:
                        all_trans.append('')
        # line = i+2
        # text = ws.cell(line, 1).value
        # describe, conclusion = tool.split_describe_conclusion(text)
        # origin = ws.cell(line, 2).value
        # size = ws.cell(line, 3).value
        # tran = ws.cell(line, 4).value
        # if conclusion is not None:
        #     conclusions = tool.split_text(conclusion)
        #     all_text.extend(conclusions)
        #     if origin is not None :
        #         origins = origin.split(',')
        #         for text in conclusions:
        #             place_in_text = []
        #             for place in origins:
        #                 if place in text:
        #                     place_in_text.append(place)
        #             all_origin.append(''.join(place_in_text))
        #     else:
        #         for i in range(len(conclusions)):
        #             all_origin.append('')
        #     if tran is not None:
        #         trans = tran.split(',')
        #         for text in conclusions:
        #             place_in_text = []
        #             for place in trans:
        #                 if place in text and place.__contains__('转移'):
        #                     place_in_text.append(place)
        #             all_trans.append(','.join(place_in_text))
        #     else:
        #         for i in range(len(conclusions)):
        #             all_trans.append('')
        #     for i in range(len(conclusions)):
        #         all_size.append('')
        #     describes = tool.split_text(describe)
        #     all_text.extend(describes)
        #     if origin is not None and size is not None:
        #         origins = origin.split(',')
        #         sizes = size.split(',')
        #         for text in describes:
        #             place_in_text1 = []
        #             place_in_text2 = []
        #             for place1 in origins:
        #                 for place2 in sizes:
        #                     if place1 in text and place2 in text:
        #                         place_in_text1.append(place1)
        #                         place_in_text2.append(place2)
        #             all_origin.append(','.join(place_in_text1))
        #             all_size.append(','.join(place_in_text2))
        #             all_trans.append('')
        #     elif origin is not None and size is None:
        #         origins = origin.split(',')
        #         for text in describes:
        #             place_in_text = []
        #             for place in origins:
        #                 if place in text:
        #                     place_in_text.append(place)
        #             all_origin.append(','.join(place_in_text))
        #             all_size.append('')
        #             all_trans.append('')
        #     else:
        #         for i in range(len(describes)):
        #             all_origin.append('')
        #             all_size.append('')
        #             all_trans.append('')
        # else:
        #     describes = tool.split_text(describe)
        #     all_text.extend(describes)
        #     if origin is not None and size is not None:
        #         origins = origin.split(',')
        #         sizes = size.split(',')
        #         for text in describes:
        #             place_in_text1 = []
        #             place_in_text2 = []
        #             for place1 in origins:
        #                 for place2 in sizes:
        #                     if place1 in text and place2 in text:
        #                         place_in_text1.append(place1)
        #                         place_in_text2.append(place2)
        #             all_origin.append(','.join(place_in_text1))
        #             all_size.append(','.join(place_in_text2))
        #     elif origin is not None and size is None:
        #         origins = origin.split(',')
        #         for text in describes:
        #             place_in_text = []
        #             for place in origins:
        #                 if place in text:
        #                     place_in_text.append(place)
        #             all_origin.append(','.join(place_in_text))
        #             all_size.append('')
        #     elif origin is None and size is None:
        #         for text in describes:
        #             all_origin.append('')
        #             all_size.append('')
        #     if tran is not None:
        #         trans = tran.split(',')
        #         for text in describes:
        #             place_in_text = []
        #             for place in trans:
        #                 if place in text and text.__contains__('转移'):
        #                     place_in_text.append(place)
        #             all_trans.append(','.join(place_in_text))
        #     else:
        #         for text in describes:
        #             all_trans.append('')

    assert len(all_text) == len(all_trans) and len(all_trans) == len(all_size) and len(all_trans) == len(all_origin), 'len(all_trans) != len(all_size) or len(all_trans) != len(all_origin)'
    for i in range(len(all_text)):
        line = i+2
        ws1.cell(line,1,all_text[i])
        ws1.cell(line,2,all_origin[i])
        ws1.cell(line,3,all_size[i])
        ws1.cell(line,4,all_trans[i])
    wb1.save(save_path)
    logger.info('Finished cut {}.xlsx'.format(file))
Example #20
 def predict_test(self, path=None, model_name=None, save_path=None):
     if path is None:
         path = self.config.test_path
         model_name = self.config.model_path.format(
             self.config.experiment_name)
         save_path = self.config.unformated_val_path.format(
             self.config.experiment_name)
     train_data = self.tool.load_data(self.config.train_path,
                                      self.config.is_bioes)
     dev_data = self.tool.load_data(self.config.dev_path,
                                    self.config.is_bioes)
     logger.info('Finished load data')
     logger.info('Building vocab ...')
     model = None
     if self.config.is_pretrained_model:
         with open(self.config.pretrained_vocab, 'r',
                   encoding='utf-8') as vocab_file:
             vocab_list = vocab_file.readlines()
         word_vocab = self.tool.get_text_vocab(vocab_list)
     else:
         if self.config.model_name == 'FLAT':
             bigram_vocab = self.tool.get_bigram_vocab(train_data, dev_data)
             lattice_vocab = self.tool.get_text_vocab(train_data, dev_data)
         else:
             word_vocab = self.tool.get_text_vocab(train_data, dev_data)
     # vectors = lattice_vocab.vectors
     vectors = None
     tag_vocab = self.tool.get_tag_vocab(train_data, dev_data)
     logger.info('Finished build vocab')
     if self.config.is_hidden_tag:
         self.hidden_tag_vocab = self.tool.get_hidden_tag_vocab(
             train_data, dev_data)
         model = self.init_model(self.config,
                                 len(word_vocab),
                                 len(tag_vocab),
                                 len(self.hidden_tag_vocab),
                                 vectors=vectors)
     elif self.config.model_name == 'FLAT':
         model = self.init_model(self.config,
                                 len(bigram_vocab),
                                 len(lattice_vocab),
                                 len(tag_vocab),
                                 vectors=vectors,
                                 n_bigram=None)
     else:
         model = self.init_model(self.config,
                                 len(word_vocab),
                                 len(tag_vocab),
                                 None,
                                 vectors=vectors)
     model.load_state_dict(torch.load(model_name))
     wb = load_workbook(filename=path)
     ws = wb['sheet1']
     max_row = ws.max_row
      with open(self.config.vocab_path, 'r') as f:
          lines = f.readlines()
     w_list = []
     for line in lines:
         splited = line.strip().split(' ')
         w = splited[0]
         w_list.append(w)
     w_trie = Trie()
     for w in w_list:
         w_trie.insert(w)
     for line_num in tqdm(range(max_row - 1)):
         line_num += 2
         sentence = ws.cell(line_num, 1).value
         # index_size = {}
         # chars = ['.', '*', '×', 'X', 'x', 'c', 'C', 'm', 'M']
         # starts = []
         # ends = []
         # i = 0
         # while i < len(sentence):
         #     if sentence[i] in chars or sentence[i].isdigit():
         #         S_start = i
         #         while i + 1 < len(sentence) and (sentence[i + 1] in chars or sentence[i + 1].isdigit()):
         #             i += 1
         #         if sentence[S_start:i + 1].__contains__('M') or sentence[S_start:i + 1].__contains__('m'):
         #             starts.append(S_start)
         #             ends.append(i)
         #         i += 1
         #     else:
         #         i += 1
         # sentence.replace('$', '')
         # new_sentence = [c for c in sentence]
         # width = 0
         # if len(starts) != 0:
         #     for i in range(len(starts)):
         #         start_i = starts[i] - width
         #         index_size[start_i] = sentence[starts[i]:ends[i] + 1]
         #         for j in range(ends[i] - starts[i]):
         #             del new_sentence[start_i]
         #         new_sentence[start_i] = '$'
         #         width += ends[i] - starts[i]
         #         a = 0
         # sentence = ''.join(new_sentence)
         sentence1 = []
         tag_pred = []
         if self.config.model_name == 'FLAT':
             texts = self.tool.split_text(sentence)
             for text in texts:
                 sentence1.extend(text)
                 bigram1 = get_bigram(text)
                 bigram = torch.tensor(
                     numpy.array([bigram_vocab.stoi[bi] for bi in bigram1],
                                 dtype='int64')).unsqueeze(1).expand(
                                     len(bigram1),
                                     self.config.batch_size).to(device)
                 lattice1 = list(text) + w_trie.get_lexicon(text)
                 lattice = torch.tensor(
                     numpy.array(
                         [lattice_vocab.stoi[word] for word in lattice1],
                         dtype='int64')).unsqueeze(1).expand(
                             len(lattice1),
                             self.config.batch_size).to(device)
                 lattice_len = torch.tensor(
                     numpy.array([len(lattice1)], dtype='int64')).expand(
                         self.config.batch_size).to(device)
                 result = model(bigram, lattice, lattice_len)[0]
                 for k in result:
                     tag_pred.append(tag_vocab.itos[k])
         else:
             texts = self.tool.split_text(sentence)
             for text in texts:
                 sentence1.extend(text)
                 text = torch.tensor(
                     numpy.array([word_vocab.stoi[word] for word in text],
                                 dtype='int64')).unsqueeze(1).expand(
                                     len(text),
                                     self.config.batch_size).to(device)
                 text_len = torch.tensor(
                     numpy.array([len(text)], dtype='int64')).expand(
                         self.config.batch_size).to(device)
                 result = model(text, text_len)[0]
                 for k in result:
                     tag_pred.append(tag_vocab.itos[k])
         sentence1 = ''.join(sentence1)
         i = 0
         origin_places = []
         sizes = []
         transfered_places = []
         while i < len(tag_pred):
             if self.config.is_bioes:
                 start = end = 0
                 if tag_pred[i][:1] == 'B':
                     kind = tag_pred[i][2:]
                     start = i
                     end = i
                     while end + 1 < len(sentence1) and (
                             tag_pred[end + 1][0] == 'I'
                             or tag_pred[end + 1][0]
                             == 'E') and tag_pred[end + 1][2:] == kind:
                         end += 1
                     if kind == 'origin_place':
                         origin_places.append(sentence1[start:end + 1])
                     elif kind == 'size':
                         sizes.append(sentence1[start:end + 1])
                     else:
                         transfered_places.append(sentence1[start:end + 1])
                     i = end + 1
                 elif tag_pred[i][:1] == 'E':
                     kind = tag_pred[i][2:]
                     start = i
                     end = i
                     if kind == 'origin_place':
                         origin_places.append(sentence1[start:end + 1])
                     elif kind == 'size':
                         # sizes.append(index_size[start])
                         sizes.append(sentence1[start:end + 1])
                     else:
                         transfered_places.append(sentence1[start:end + 1])
                     i += 1
                 else:
                     i += 1
             else:
                 start = end = 0
                 if tag_pred[i][:1] == 'B':
                     kind = tag_pred[i][2:]
                     start = end = i
                     while end + 1 < len(tag_pred) and tag_pred[
                             end + 1][0] == 'I' and tag_pred[
                                 end + 1][2:] == kind:
                         end += 1
                     if kind == 'origin_place':
                         origin_places.append(sentence1[start:end + 1])
                     elif kind == 'size':
                         # sizes.append(index_size[start])
                         sizes.append(sentence1[start:end + 1])
                     else:
                         transfered_places.append(sentence1[start:end + 1])
                     i = end + 1
                 else:
                     i += 1
         for places in [origin_places, sizes, transfered_places]:
             # drop empty strings in place before the sets are written to the sheet
             places[:] = [place for place in places if place != '']
         ws.cell(line_num, 2).value = ','.join(list(set(origin_places)))
         ws.cell(line_num, 3).value = ','.join(list(set(sizes)))
         ws.cell(line_num, 4).value = ','.join(list(set(transfered_places)))
     wb.save(filename=save_path)
     logger.info('Finished Predicting...')
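The loop above decodes the BIOES tag sequence inline while filling the workbook. The same span extraction can be written and checked in isolation; the following is a sketch only (the entity kinds are taken from the example above, the toy sentence and tags are invented):

def extract_spans(sentence, tags):
    # group B-/I-/E-/S- tags into text spans keyed by entity kind
    spans = {'origin_place': [], 'size': [], 'transfered_place': []}
    i = 0
    while i < len(tags):
        if tags[i][:1] in ('B', 'S'):
            kind = tags[i][2:]
            end = i
            # extend the span over following I-/E- tags of the same kind
            while end + 1 < len(tags) and tags[end + 1][:1] in ('I', 'E') \
                    and tags[end + 1][2:] == kind:
                end += 1
            spans.setdefault(kind, []).append(sentence[i:end + 1])
            i = end + 1
        else:
            i += 1
    return spans

# toy check (invented data):
# extract_spans('肝转移灶2.0CM',
#               ['S-transfered_place', 'O', 'O', 'O',
#                'B-size', 'I-size', 'I-size', 'I-size', 'E-size'])
# -> {'origin_place': [], 'size': ['2.0CM'], 'transfered_place': ['肝']}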
Exemplo n.º 21
0
async def fetch_new_proxy_task():
    logger.info("run fetch_new_proxy_task")
    await spider.run_spider()
    await verifier.verify_new_proxy()
    # await verify_error_proxy_task()
    await update_squid_task()
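A sketch (not part of the original project) of one way to keep this task running on a schedule with plain asyncio; the fetch_proxy_loop name and the 10-minute interval are assumptions:

import asyncio

async def fetch_proxy_loop(interval_seconds=600):
    # refresh the proxy pool periodically; a failed run is logged and the loop continues
    while True:
        try:
            await fetch_new_proxy_task()
        except Exception:
            logger.exception('fetch_new_proxy_task failed')
        await asyncio.sleep(interval_seconds)

# e.g. scheduled alongside the other tasks with loop.create_task(fetch_proxy_loop())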
Exemplo n.º 22
0
 def test_format_result(self):
     self.train_data = self.tool.load_data(self.config.train_path)
     self.dev_data = self.tool.load_data(self.config.dev_path)
     tag_vocab = self.tool.get_tag_vocab(self.train_data, self.dev_data)
     self.predict_test(
         path=self.config.dev_path,
         save_path=self.config.test_unformated_val_path.format(
             self.config.experiment_name))
     tag_true = []
     tag_formated_pred = []
     tag_unformated_pred = []
     format_result(path=self.config.test_unformated_val_path.format(
         self.config.experiment_name),
                   save_path=self.config.test_formated_val_path.format(
                       self.config.experiment_name))
     dev_data = self.tool.load_data(self.config.dev_path)
     formated_dev_data = self.tool.load_data(
         self.config.test_formated_val_path.format(
             self.config.experiment_name))
     unformated_dev_data = self.tool.load_data(
         self.config.test_unformated_val_path.format(
             self.config.experiment_name))
     assert len(dev_data.examples) == len(
         unformated_dev_data.examples
     ), 'dev_data:{} != unformated_dev_data:{}'.format(
         len(dev_data.examples), len(unformated_dev_data.examples))
     assert len(dev_data.examples) == len(
         formated_dev_data.examples
     ), 'dev_data:{} != formated_dev_data:{}'.format(
         len(dev_data.examples), len(formated_dev_data.examples))
     for example1 in dev_data.examples:
         tag_true.extend(example1.tag)
     for example2 in formated_dev_data.examples:
         tag_formated_pred.extend(example2.tag)
     for example3 in unformated_dev_data.examples:
         tag_unformated_pred.extend(example3.tag)
     # token-level evaluation of the unformated and formated predictions
     assert len(tag_true) == len(
         tag_unformated_pred), 'tag_true:{} != tag_unformated_pred:{}'.format(
             len(tag_true), len(tag_unformated_pred))
     assert len(tag_true) == len(
         tag_formated_pred), 'tag_true:{} != tag_formated_pred:{}'.format(
             len(tag_true), len(tag_formated_pred))
     # evaluate every tag except the 'O' background label
     labels = [label for label in tag_vocab.itos if label != 'O']
     prf_dict_formated = classification_report(tag_true,
                                               tag_formated_pred,
                                               labels=labels,
                                               output_dict=True)
     prf_dict_unformated = classification_report(tag_true,
                                                 tag_unformated_pred,
                                                 labels=labels,
                                                 output_dict=True)
     # log the weighted-average scores for both result files
     logger.info('unformated report {}'.format(
         prf_dict_unformated['weighted avg']))
     logger.info('formated report {}'.format(
         prf_dict_formated['weighted avg']))
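If a single headline number is wanted from the two reports, the weighted F1 can be read straight out of the output_dict result. A minimal sketch, assuming classification_report is scikit-learn's called with output_dict=True (which the ['weighted avg'] access above suggests):

def weighted_f1(prf_dict):
    # prf_dict is the dict returned by classification_report(..., output_dict=True)
    return prf_dict['weighted avg']['f1-score']

# e.g. logger.info('weighted F1 formated={:.4f} unformated={:.4f}'.format(
#          weighted_f1(prf_dict_formated), weighted_f1(prf_dict_unformated)))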
Exemplo n.º 23
0
def data_clean_test(path='./task2_no_val{}.xlsx'):
    wb = load_workbook(path.format(''))
    ws = wb['sheet1']
    max_row = ws.max_row
    wb1 = Workbook()
    ws1 = wb1.create_sheet('sheet1')
    wb1.remove(wb1['Sheet'])
    names = ['原文', '肿瘤原发部位', '原发病灶大小', '转移部位']
    for i in range(len(names)):
        ws1.cell(1, i + 1, names[i])
    for i in range(max_row - 1):
        line = i + 2
        new_sentence = ''
        chars = ['.','*','×','X','x','c','C','m','M']
        # o_chars = ['_x0004_', '�', ':', ',', ';']
        # t_chars = ['', '', ':', ',', ';']
        o_chars = ['�']
        t_chars = ['']
        # use a separate loop variable for the column so the outer row index `i` is not
        # shadowed; only the report text (column 1) is present in the unlabeled test file
        for col in range(4):
            if col == 0:
                new_sentence = ws.cell(line, col + 1).value
                if '检测值' not in new_sentence:
                    new_sentence = new_sentence.replace(' ', '')
                for j in range(len(o_chars)):
                    new_sentence = new_sentence.replace(o_chars[j], t_chars[j])
                j = 0
                while j < len(new_sentence):
                    while j < len(new_sentence) and not new_sentence[j].isdigit():
                        j+=1
                    start = j
                    end = start
                    while end+1<len(new_sentence) and (new_sentence[end+1] in chars or new_sentence[end+1].isdigit()):
                        end+=1
                    old_size = new_sentence[start:end + 1]
                    if 'm' in old_size or 'M' in old_size:
                        new_size = ''
                        nums = ''
                        k=start
                        flag = False
                        while k<=end:
                            # stay inside the size segment so k cannot run past the string
                            while k <= end and (new_sentence[k].isdigit() or new_sentence[k] == '.'):
                                nums += new_sentence[k]
                                k += 1
                                flag = True
                            if flag:
                                nums+=','
                                flag=False
                            k+=1
                        nums = nums[:-1].split(',')
                        if 'c' in old_size or 'C' in old_size:
                            for num in nums:
                                new_size = new_size+num+'CM'+'×'
                            new_size = new_size[:-1]
                        else:
                            for num in nums:
                                new_size = new_size+num+'MM'+'×'
                            new_size = new_size[:-1]
                        j=end
                        new_sentence = new_sentence.replace(old_size,new_size)
                    j+=1
                ws1.cell(line, col + 1, new_sentence)
    wb1.save(path.format('_cleaned'))
    logger.info('Finished cleaning test data')
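The size normalisation above walks the string character by character. The same rewrite (numbers joined by *, ×, X or x and ending in cm or mm become the numbers with CM or MM appended, joined by ×) can also be expressed with a regular expression; this is a hedged sketch that mirrors, but is not guaranteed to be identical to, the loop above:

import re

_SIZE_RE = re.compile(r'(\d+(?:\.\d+)?(?:\s*[*×Xx]\s*\d+(?:\.\d+)?)*)\s*([cCmM])[mM]')

def normalize_sizes(text):
    # e.g. '大小约2.0*1.5cm' -> '大小约2.0CM×1.5CM'
    def repl(match):
        nums = re.split(r'[*×Xx]', match.group(1))
        unit = 'CM' if match.group(2) in ('c', 'C') else 'MM'
        return '×'.join(num.strip() + unit for num in nums)
    return _SIZE_RE.sub(repl, text)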
Exemplo n.º 24
0
 def predict_sentence(self, model_name=None):
     if model_name is None:
         model_name = self.config.model_path.format(
             self.config.experiment_name)
     train_data = self.tool.load_data(self.config.train_path,
                                      self.config.is_bioes)
     dev_data = self.tool.load_data(self.config.dev_path,
                                    self.config.is_bioes)
     logger.info('Finished load data')
     logger.info('Building vocab ...')
     model = None
     if self.config.is_pretrained_model:
         with open(self.config.pretrained_vocab, 'r',
                   encoding='utf-8') as vocab_file:
             vocab_list = vocab_file.readlines()
         word_vocab = self.tool.get_text_vocab(vocab_list)
     else:
         if self.config.model_name == 'FLAT':
             bigram_vocab = self.tool.get_bigram_vocab(train_data, dev_data)
             lattice_vocab = self.tool.get_text_vocab(train_data, dev_data)
         else:
             word_vocab = self.tool.get_text_vocab(train_data, dev_data)
     vectors = None
     tag_vocab = self.tool.get_tag_vocab(train_data, dev_data)
     logger.info('Finished build vocab')
     if self.config.is_hidden_tag:
         self.hidden_tag_vocab = self.tool.get_hidden_tag_vocab(
             train_data, dev_data)
         model = self.init_model(self.config,
                                 len(word_vocab),
                                 len(tag_vocab),
                                 len(self.hidden_tag_vocab),
                                 vectors=vectors)
     elif self.config.model_name == 'FLAT':
         model = self.init_model(self.config,
                                 len(bigram_vocab),
                                 len(lattice_vocab),
                                 len(tag_vocab),
                                 vectors=vectors,
                                 n_bigram=None)
     else:
         model = self.init_model(self.config,
                                 len(word_vocab),
                                 len(tag_vocab),
                                 None,
                                 vectors=vectors)
     model.load_state_dict(torch.load(model_name))
     # read the external lexicon used for the FLAT lattice; the first whitespace-separated
     # token on each line is the word (file assumed to be UTF-8, as elsewhere in the project)
     with open(self.config.vocab_path, 'r', encoding='utf-8') as f:
         lines = f.readlines()
     w_list = []
     for line in lines:
         w_list.append(line.strip().split(' ')[0])
     w_trie = Trie()
     for w in w_list:
         w_trie.insert(w)
     while True:
         print('Please enter a sentence:')
         sentence = input()
         sentence1 = []
         if self.config.model_name == 'FLAT':
             texts = self.tool.split_text(sentence)
             tag_pred = []
             for text in texts:
                 sentence1.extend(text)
                 bigram1 = get_bigram(text)
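                 # a single sentence is unsqueezed and expanded over the batch dimension so
                 # its shape matches the batch size the model was constructed with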
                 bigram = torch.tensor(
                     numpy.array([bigram_vocab.stoi[bi] for bi in bigram1],
                                 dtype='int64')).unsqueeze(1).expand(
                                     len(bigram1),
                                     self.config.batch_size).to(device)
                 lattice1 = list(text) + w_trie.get_lexicon(text)
                 lattice = torch.tensor(
                     numpy.array(
                         [lattice_vocab.stoi[word] for word in lattice1],
                         dtype='int64')).unsqueeze(1).expand(
                             len(lattice1),
                             self.config.batch_size).to(device)
                 lattice_len = torch.tensor(
                     numpy.array([len(lattice1)], dtype='int64')).expand(
                         self.config.batch_size).to(device)
                 result = model(bigram, lattice, lattice_len)[0]
                 for k in result:
                     tag_pred.append(tag_vocab.itos[k])
         else:
             texts = self.tool.split_text(sentence)
             tag_pred = []
             for text in texts:
                 sentence1.extend(text)
                 text = torch.tensor(
                     numpy.array([word_vocab.stoi[word] for word in text],
                                 dtype='int64')).unsqueeze(1).expand(
                                     len(text),
                                     self.config.batch_size).to(device)
                 text_len = torch.tensor(
                     numpy.array([len(text)], dtype='int64')).expand(
                         self.config.batch_size).to(device)
                 result = model(text, text_len)[0]
                 for k in result:
                     tag_pred.append(tag_vocab.itos[k])
         sentence1 = ''.join(sentence1)
         i = 0
         origin_places = []
         sizes = []
         transfered_places = []
         while i < len(tag_pred):
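             # non-BIOES decoding: consecutive non-O tags of the same kind form one span;
             # two adjacent entities of the same kind would be merged by this scheme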
             start = 0
             end = 0
             kind = None
             if tag_pred[i] != 'O':
                 start = i
                 kind = tag_pred[i][2:]
                 while i + 1 < len(tag_pred) and tag_pred[i +
                                                          1][2:] == kind:
                     i += 1
                 end = i + 1
                 if kind == 'origin_place':
                     origin_places.append(sentence1[start:end])
                 elif kind == 'size':
                     sizes.append(sentence1[start:end])
                 else:
                     transfered_places.append(sentence1[start:end])
             i += 1
         # print(sentence1)
         # print(tag_pred)
         for i in range(len(sentence1)):
             print(sentence1[i], tag_pred[i])
         print(origin_places)
         print(sizes)
         print(transfered_places)
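In the FLAT branch, w_trie.get_lexicon(text) supplies the dictionary words that are appended to the character sequence to build the lattice. The following is an illustrative sketch of what that call is assumed to do (the project's own Trie class may differ): a minimal character trie that returns every lexicon word occurring in the text.

class SimpleTrie:
    def __init__(self):
        self.children = {}
        self.is_word = False

    def insert(self, word):
        node = self
        for ch in word:
            node = node.children.setdefault(ch, SimpleTrie())
        node.is_word = True

    def get_lexicon(self, text):
        # collect every inserted word that appears as a substring of text
        matches = []
        for start in range(len(text)):
            node = self
            for end in range(start, len(text)):
                node = node.children.get(text[end])
                if node is None:
                    break
                if node.is_word:
                    matches.append(text[start:end + 1])
        return matches

# trie = SimpleTrie(); trie.insert('转移'); trie.insert('肝转移')
# trie.get_lexicon('肝转移灶') -> ['肝转移', '转移']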
Exemplo n.º 25
0
async def update_squid_task():
    logger.info("run update_squid_task")
    s = sess_maker()
    proxies = s.query(Proxy).filter(Proxy.status == STATUS_OK).all()
    s.close()
    squid.update_conf(proxies)