예제 #1
0
def upload_catalog_files():
    """Save uploaded files into a user's catalog and bump its pending counter.

    Reads ``user_id``, ``catalog_id`` and ``catalog_name`` from
    ``request.form`` and the uploaded files from ``request.files`` under the
    ``"file"`` key. After the files, the user and the catalog have been
    validated, each file is registered through ``validate_catalog_file`` and
    written with ``save_file``. The catalog's ``data_un_extracted_files``
    counter is then increased by the number of uploaded files.

    Returns:
        str: A JSON document with a ``status`` ("success" or "error") and a
        human-readable ``message``. Any exception raised while handling the
        request is caught and reported as an "error" response instead of
        propagating.
    """
    try:
        user_id = ObjectId(str(request.form.get('user_id')))
        catalog_id = request.form.get('catalog_id')
        catalog_name = str(request.form.get('catalog_name')).strip()
        uploaded_files = request.files.getlist("file")

        if not validate_upload_files(uploaded_files):
            return json.dumps({
                "status": "error",
                "message":
                "Invalid file types. Please upload only .txt, .pdf, .docx files."
            })

        user = validate_user(user_id)
        if user is None:
            return json.dumps({
                "status": "error",
                "message": "Invalid user_id '{}'".format(user_id)
            })

        # The client sends the literal string 'null' when no catalog has been
        # selected; only in that case is ``create=True`` passed on.
        # NOTE(review): the catalog is looked up by name in both branches and
        # catalog_id is only used for this check and the error message below
        # -- confirm that this is intended.
        if catalog_id == 'null':
            user_catalog = validate_catalog(user=user,
                                            catalog_name=catalog_name,
                                            create=True)
        else:
            user_catalog = validate_catalog(user=user,
                                            catalog_name=catalog_name)
        if not user_catalog:
            return json.dumps({
                "status": "error",
                "message": "Invalid catalog_id '{}'".format(catalog_id)
            })

        for up_file in uploaded_files:
            up_file_doc = validate_catalog_file(catalog=user_catalog,
                                                file_name=up_file.filename)
            # A file document already flagged as extracted is passed through
            # update_file before the new content is saved.
            # NOTE(review): presumably this resets its extraction state --
            # verify against update_file.
            if up_file_doc['is_entity_extracted']:
                update_file(up_file_doc)
            save_file(file=up_file, file_doc=up_file_doc)

        # $inc is applied atomically by the server and creates the field when
        # it does not exist yet, so one statement covers both the "first
        # upload" and the "add to existing count" cases without the lost
        # updates that a read-then-$set can produce under concurrent uploads.
        USER_CATALOGS_COL.update_one(
            {"_id": user_catalog["_id"]},
            {"$inc": {"data_un_extracted_files": len(uploaded_files)}})

        return json.dumps({
            'status':
            'success',
            'message':
            '{} file(s) uploaded into "{}" catalog.'.format(
                len(uploaded_files), catalog_name)
        })
    except Exception as err:
        return json.dumps({'status': 'error', 'message': str(err)})
예제 #2
0
def process_catalog_files():
    """Run entity extraction over a catalog's not-yet-processed files.

    Reads ``user_id`` and ``catalog_id`` from the JSON request body, checks
    that the user exists and that the catalog belongs to that user, and then
    loads every catalog file document that has both ``is_entity_extracted``
    and ``is_manually_updated`` set to ``False``. The text of each ``.txt``,
    ``.pdf`` or ``.docx`` file is fed to the ``Resume_Keyword_Extraction``
    model and the returned entities are stored on the file document together
    with ``is_entity_extracted = True``. Files with any other (or no)
    extension are skipped and stay unprocessed.

    Finally the catalog's ``data_un_extracted_files`` counter is reduced by
    the number of files actually processed (or set to 0 when the field is
    missing) and ``prev_data_extracted_date`` is set to the current time.

    Returns:
        str: A JSON document with a ``status`` ("success" or "error") and a
        human-readable ``message``. Any exception raised while handling the
        request is caught and reported as an "error" response instead of
        propagating.
    """
    try:
        req_data = request.get_json()
        user_id = ObjectId(str(req_data['user_id']))
        catalog_id = ObjectId(str(req_data['catalog_id']))

        user = validate_user(user_id)
        if user is None:
            return json.dumps({
                "status": "error",
                "message": "Invalid user_id '{}'".format(user_id)
            })

        # Matching on both fields ensures the catalog belongs to this user.
        user_cat_col = USER_CATALOGS_COL.find_one({
            "user": user["_id"],
            "_id": catalog_id
        })
        if not user_cat_col:
            return json.dumps({
                "status": "error",
                "message": "Invalid catalog_id '{}'".format(catalog_id)
            })

        catalog_resumes = list(
            CATALOG_FILES_COL.find({
                "catalog": user_cat_col['_id'],
                "is_entity_extracted": False,
                "is_manually_updated": False
            }))
        nlp_obj = TrainModel(model='Resume_Keyword_Extraction')

        processed_count = 0
        for file_doc in catalog_resumes:
            file_name = file_doc["file_name"]
            file_full_path = os.path.join(UPLOAD_FILE_PATH,
                                          str(file_doc["catalog"]),
                                          file_name)
            # rpartition never raises: a name without a dot yields an empty
            # separator, which is treated as "no extension".
            _, dot, extension = file_name.rpartition('.')
            file_type = extension.lower() if dot else ''

            if file_type == "txt":
                file_data = get_text_from_text_file(file_full_path)
            elif file_type == "pdf":
                file_data = extract_text_from_pdf_file(file_full_path)
            elif file_type == "docx":
                file_data = get_text_from_docx_file(file_full_path)
            else:
                # Unsupported type: skip it rather than reusing the text of
                # the previously processed file (or hitting an unbound name).
                continue

            entity_data = json.loads(nlp_obj.get_entities(text=file_data))
            entity_data['is_entity_extracted'] = True
            # update_one matches the rest of the file; the legacy
            # Collection.update API no longer exists in PyMongo 4.
            CATALOG_FILES_COL.update_one({"_id": file_doc['_id']},
                                         {"$set": entity_data})
            processed_count += 1

        # Only files that were really extracted leave the pending count.
        if "data_un_extracted_files" in user_cat_col:
            data_un_extracted_files_count = (
                user_cat_col["data_un_extracted_files"] - processed_count)
        else:
            data_un_extracted_files_count = 0

        USER_CATALOGS_COL.update_one({"_id": user_cat_col["_id"]}, {
            "$set": {
                "prev_data_extracted_date": datetime.now(),
                "data_un_extracted_files": data_un_extracted_files_count
            }
        })
        return json.dumps({
            'status':
            'success',
            'message':
            'Data extracted from "{}" catalog files'.format(
                user_cat_col["name"])
        })
    except Exception as err:
        return json.dumps({'status': 'error', 'message': str(err)})