Exemplo n.º 1
0
def on_loop(project_id):
    """Run one analysis pass over a project's pending 'docx' document tasks.

    Tasks come from ``get_new_doc_task_db(project_id, 'docx')``; only the
    last ``config.n_for_project_in_loop`` rows are handled per call. For
    every task the file is downloaded, converted, analysed, its extracted
    images are uploaded and stored as attachments, the analysis fields are
    written to the document record, and keyword tags are linked to the file.

    Each stage is guarded separately: when a stage fails, the failure is
    reported through ``analysis_log`` and the loop moves on to the next task.

    ``step`` values sent through ``change_step`` by this function:
        2 - processing finished
        4 - file larger than the size limit, not analysed
        5 - file type is in ``config.skip_file_types`` (or type is missing)
        6 - ``fileUrl`` is not an http(s) URL
        7 - ``core.analysis`` raised

    Args:
        project_id: Project identifier, forwarded as ``projid`` to the
            data-access helpers.

    Returns:
        None.
    """
    max_file_bytes = 300 * 1000 * 1000
    summary_limit = 3
    new_words_limit = 900

    docdata = get_new_doc_task_db(project_id, 'docx')
    # Explicit length test: the later calls treat docdata as a DataFrame,
    # whose truth value is ambiguous.
    if len(docdata) == 0:
        return

    docdata = docdata.tail(config.n_for_project_in_loop)
    # Normalise column names so that the first character is lower case.
    docdata.columns = [s[0].lower() + s[1:] for s in docdata.columns]
    docdata = docdata.dropna(subset=['fileUrl', 'step']).reset_index()

    basepath = config.root_dir
    imgdir = os.path.join(config.root_dir, 'images')
    for _, dt in docdata.iterrows():
        # Stored as a string because dt.to_dict() is handed to change_step
        # (presumably it has to be serialisable - TODO confirm).
        dt['createTime'] = str(dt['createTime'].asm8)
        print(datetime.now())
        info_log_obj = {'id': dt['fileId'], 'name': dt['name']}

        # Nothing to download.
        if not dt['fileUrl'].startswith('http'):
            dt['step'] = 6
            change_step(dt['id'], dt.to_dict(), projid=project_id)
            analysis_log('无文件', info_log_obj)
            continue

        # Skip file types that must not be analysed. A missing file type is
        # skipped too, but only when at least one skip type is configured.
        file_type = dt['fileType']
        if any(not file_type or tp in file_type
               for tp in config.skip_file_types):
            dt['step'] = 5
            change_step(dt['id'], dt.to_dict(), projid=project_id)
            info_log_obj['type'] = file_type
            analysis_log('跳过类型', info_log_obj)
            continue

        # Download the file into the local working directory.
        try:
            curpath = os.path.join(basepath, dt['name'])
            download_doc(dt['fileUrl'], curpath)
        except Exception as e:
            analysis_log('下载文件', info_log_obj)
            print(e)
            continue

        # Convert the file.
        try:
            # Very large files are marked and not analysed.
            if os.path.getsize(curpath) > max_file_bytes:
                analysis_log('文件过大', info_log_obj)
                dt['step'] = 4
                change_step(dt['id'], dt.to_dict(), projid=project_id)
                analysis_log('完成', info_log_obj)
                continue

            extname = os.path.splitext(dt['name'])[1]
            core.transform(curpath, basepath, extname)
        except Exception as e:
            analysis_log('转换文件', info_log_obj)
            print(e)
            continue

        # Analyse the file into fields.
        try:
            kwords, kwfreq, pharr, nwarr, sumarr, attaimges, *_ = core.analysis(
                curpath, extname, imgdir=imgdir, do_drawings=True)

            real_kwords = [kw for kw in kwords.split(',') if is_real_kw(kw)]
            # The first five keywords are used as tags below; the remainder
            # is stored in the record's "keyWord" field.
            low_kw = real_kwords[5:]
        except Exception as e:
            dt['step'] = 7
            change_step(dt['id'], dt.to_dict(), projid=project_id)
            analysis_log('分析成字段', info_log_obj)
            print(e)
            continue

        # Image attachments.
        try:
            # Upload the extracted images (to OSS, per the original comment).
            upload_result = core.upload_images(attaimges)

            # Write one attachment row per uploaded image.
            for atta in upload_result:
                atta_obj = {
                    "name": atta['name'],
                    "remark": "",
                    "keyword": "",
                    "abstract": utils.remove_blank(atta['abstract']),
                    "url": atta['url'],
                    "fileSize": atta['fileSize'],
                    "fileType": atta['fileType'],
                    "newWords": "",
                    "wordFrequency": "",
                    "phrases": "",
                    "linkType": "文件关联图片",
                    "fileId": dt['fileId']
                }
                add_attachment(atta_obj, projid=project_id)
        except Exception as e:
            print(e)
            analysis_log('图片附件', info_log_obj)
            continue

        # Write the analysis fields into the file table.
        try:
            doc_record = get_docs_byid(dt['fileId'], projid=project_id)

            # Keep at most `summary_limit` summaries, preferring the longest.
            real_summary = [su for su in sumarr if is_real_summary(su)]
            if len(real_summary) > summary_limit:
                real_summary = sorted(real_summary,
                                      key=len,
                                      reverse=True)[:summary_limit]

            nwarr = utils.remove_blank(nwarr)[:new_words_limit]
            doc_record.update({
                "keyWord": ','.join(low_kw),
                "abstract": ','.join(real_summary),
                "newWords": nwarr,
                "wordFrequency": kwfreq,
                "phrases": pharr
            })
            fill_docinfo(doc_record['id'], doc_record, projid=project_id)
        except Exception as e:
            analysis_log('文件表填入', info_log_obj)
            print(e)
            continue

        # Create missing tags and link them to the file.
        try:
            if not real_kwords:
                analysis_log('无内容', info_log_obj)
            else:
                # Case-insensitive name -> id lookup; setdefault keeps the
                # first tag when several names differ only by case.
                tag_ids = {}
                for t in get_doctag(projid=project_id):
                    tag_ids.setdefault(str(t['name']).upper(), t['id'])

                dtrels = []
                for curtag in real_kwords[:config.web_keywords_num]:
                    key = str(curtag).upper()
                    if key in tag_ids:
                        tagid = tag_ids[key]
                    else:
                        tagid = create_doctag(curtag, projid=project_id)
                    dtrels.append((dt['fileId'], tagid))
                # Store the file <-> tag relations.
                create_doctagrel(dtrels, projid=project_id)
        except Exception as e:
            analysis_log('标签', info_log_obj)
            print(e)
            continue

        # Mark the task as finished. Reaching this point implies that the
        # file-table write succeeded (its failure path continues above).
        dt['step'] = 2
        change_step(dt['id'], dt.to_dict(), projid=project_id)

        # NOTE: the locally downloaded file is not deleted here.
        analysis_log('完成', info_log_obj)

    print('end proj')
def on_loop(project_id):
    """Run one analysis pass over a project's document tasks stored locally.

    Tasks come from ``get_documenttask(projid=project_id)``; only rows with
    ``step == 1`` are considered and only the last
    ``config.n_for_project_in_loop`` of them are handled per call. In this
    variant ``fileUrl`` is used as a local file path: ``.doc``/``.ppt`` files
    are converted and ``.dwg``/``.rar``/``.zip`` files are copied into the
    working directory before ``core.analysis`` runs. The resulting fields are
    written to the document record and keyword tags are linked to the file.

    Each stage is guarded separately: when a stage fails, the failure is
    reported through ``analysis_log`` and the loop moves on to the next task.
    Oversized files and files whose analysis raised are set to step 2 so
    that they are not picked up again.

    Args:
        project_id: Project identifier, forwarded as ``projid`` to the
            data-access helpers.

    Returns:
        None.
    """
    max_file_bytes = 100 * 1000 * 1000
    summary_limit = 3
    new_words_limit = 900

    docdata = pd.DataFrame(get_documenttask(projid=project_id))
    # Explicit length test: a DataFrame's truth value is ambiguous.
    if len(docdata) == 0:
        return

    docdata = docdata[docdata['step'] == 1]
    docdata = docdata.tail(config.n_for_project_in_loop)
    docdata = docdata.dropna(subset=['fileUrl', 'step']).reset_index()

    # Hard-coded local working directory for converted / copied files.
    basepath = r'E:\file-local-analysis'
    for _, dt in docdata.iterrows():
        info_log_obj = {'id': dt['fileId'], 'name': dt['name']}
        print()
        analysis_log('开始', info_log_obj)

        try:
            # Check the size first so that an oversized file is not converted
            # or copied only to be skipped afterwards.
            if os.path.getsize(dt['fileUrl']) > max_file_bytes:
                analysis_log('文件过大', info_log_obj)
                dt['step'] = 2
                change_step(dt['id'], dt.to_dict(), projid=project_id)
                continue

            curpath = dt['fileUrl']
            extname = os.path.splitext(dt['name'])[1]
            staged_path = os.path.join(basepath, dt['name'])

            if extname == '.doc':
                transdoc.doc2docx(curpath, basepath, remove=False)
                curpath = staged_path
            elif extname == '.ppt':
                transppt.ppt2pptx(curpath, basepath, remove=False)
                curpath = staged_path
            elif extname in ('.dwg', '.rar', '.zip'):
                # Drawings and archives are moved locally for analysis
                # (per the original note, not needed for online analysis).
                shutil.copy(curpath, basepath)
                curpath = staged_path
        except Exception as e:
            analysis_log('下载和转换文件', info_log_obj)
            print(e)
            continue

        # Analyse the file into fields.
        try:
            kwords, kwfreq, pharr, nwarr, sumarr, *_ = core.analysis(
                curpath, extname, imgdir=None, do_drawings=True)

            real_kwords = [kw for kw in kwords.split(',') if is_real_kw(kw)]
            # The first five keywords are used as tags below; the remainder
            # is stored in the record's "keyWord" field.
            low_kw = real_kwords[5:]
        except Exception as e:
            analysis_log('分析成字段', info_log_obj)
            print(e)

            # Mark the task as done anyway so that a file which always fails
            # is not retried on every pass.
            dt['step'] = 2
            change_step(dt['id'], dt.to_dict(), projid=project_id)
            continue

        # Write the analysis fields into the file table.
        try:
            doc_record = get_docs_byid(dt['fileId'], projid=project_id)

            # Keep at most `summary_limit` summaries, preferring the longest.
            real_summary = [su for su in sumarr if is_real_summary(su)]
            if len(real_summary) > summary_limit:
                real_summary = sorted(real_summary,
                                      key=len,
                                      reverse=True)[:summary_limit]

            nwarr = utils.remove_blank(nwarr)[:new_words_limit]
            doc_record.update({
                "keyWord": ','.join(low_kw),
                "abstract": ','.join(real_summary),
                "newWords": nwarr,
                "wordFrequency": kwfreq,
                "phrases": pharr
            })
            fill_docinfo(doc_record['id'], doc_record, projid=project_id)
        except Exception as e:
            analysis_log('文件表填入', info_log_obj)
            print(e)
            continue

        # Create missing tags and link them to the file.
        try:
            if not real_kwords:
                analysis_log('无内容', info_log_obj)
            else:
                # Case-insensitive name -> id lookup; setdefault keeps the
                # first tag when several names differ only by case.
                tag_ids = {}
                for t in get_doctag(projid=project_id):
                    tag_ids.setdefault(str(t['name']).upper(), t['id'])

                dtrels = []
                for curtag in real_kwords[:config.web_keywords_num]:
                    key = str(curtag).upper()
                    if key in tag_ids:
                        tagid = tag_ids[key]
                    else:
                        tagid = create_doctag(curtag, projid=project_id)
                    dtrels.append((dt['fileId'], tagid))
                # Store the file <-> tag relations.
                create_doctagrel(dtrels, projid=project_id)
        except Exception as e:
            analysis_log('标签', info_log_obj)
            print(e)
            continue

        # Mark the task as finished. Reaching this point implies that the
        # file-table write succeeded (its failure path continues above).
        dt['step'] = 2
        change_step(dt['id'], dt.to_dict(), projid=project_id)

        # NOTE: the staged local copy is not deleted here.
        analysis_log('完成', info_log_obj)

    print('end proj')