Example #1
def make_jsonFile(dic_path, doc_path, file_name):
    igate_cl = generate_Dictionary(dic_path)

    # build a lookup dictionary from the table of contents
    index_dictionary = igate_cl.DICTIONRAY_LIST

    json_file = []
    # load the document
    doc_result = docx2python(doc_path)
    main_title = ''
    sub_title = ''
    title = ''
    index_len = ''

    for j in range(1, len(doc_result.body)):
        doc_body = doc_result.body[j]
        doc_len = len(doc_body)

        if doc_len == 1:
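            # docx2python wraps plain text in single-row pseudo-tables,
            # so one "row" here means a text block; real tables have more rows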
            # split the document
            doc_list = generate_doc(doc_body, doc_len, index_dictionary)
            print(doc_list)
            for content in doc_list:
                #print(content)
                for text in content:
                    if text in index_dictionary:
                        #print(text)
                        index = index_dictionary[text]  # index is the numbering of this TOC entry
                        index_len = len(index)

                        # 1: major / 2: middle / 3: minor classification level
                        if index_len == 1:
                            main_title = text
                            sub_title = text
                            title = text

                        elif index_len == 2:
                            sub_title = text
                            title = text

                        elif index_len == 3:
                            title = text
                json_data = generate_doc_to_json(index_len, content,
                                                 main_title, sub_title, title,
                                                 "string")
                json_file.append(json_data)

        else:
            print('ID = {0}'.format(str(j)))
            # table
            table = generate_table(doc_body, doc_len)
            #print(table)
            json_data = generate_doc_to_json(index_len, table, main_title,
                                             sub_title, title, "table")
            json_file.append(json_data)

    with open(path.RESULT_DATA_PATH + file_name + '.json',
              'w',
              encoding='utf-8') as make_file:
        json.dump(json_file, make_file, ensure_ascii=False, indent="\t")
Example #2
    def docx2text(self, filename):
        """
        :param filename: source docx file
        :return: text content of the docx
        """

        def flatten(S):
            """
            Flatten a nested list.
            :param S: nested list
            :return: a single flat list
            """
            if S == []:
                return S
            if isinstance(S[0], list):
                return flatten(S[0]) + flatten(S[1:])
            return S[:1] + flatten(S[1:])

        if filename.split('.')[-1] == "docx":
            # extract the text
            doc_result = docx2python(filename)
            # flatten the result
            res = flatten(doc_result.body)
            # drop whitespace-only entries
            res = [r for r in res if r.strip()]
            # join back into one running-text string
            content = '。'.join(res)
        elif filename.split('.')[-1] == "doc":
            content = subprocess.check_output(['antiword', filename])
            content = content.decode('utf-8')
        else:
            # make the unsupported-extension case explicit instead of raising UnboundLocalError
            content = ''
        return content
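
    # usage sketch (hypothetical owner class and file name, not from the source):
    #   text = SomeExtractor().docx2text('report.docx')
    #   print(text[:200])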
Example #3
def main(path):
    print(path)
    real_path = path
    if path.endswith(".doc"):
        word = wc.Dispatch('Word.Application')
        doc = word.Documents.Open(path)
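        # FileFormat 12 = wdFormatXMLDocument, i.e. save as .docx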
        doc.SaveAs(path + "x", 12, False, "", True, "", False, False, False,
                   False)
        doc.Close()
        word.Quit()
        real_path = path + "x"
    elif not path.endswith(".docx"):
        print("请检查文件后缀名是否有效!")
        return

    keyword = '参考文献'  # the "References" section heading to search for
    file = docx2python(real_path)
    temp = file.text.split('\n')
    content = []
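    # keep every other line: .text appears to separate paragraphs with blank lines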
    for i in range(len(temp)):
        if i % 2 == 0:
            content.append(temp[i])

    # you can print reference to inspect it; it is already plain text
    reference = extract(keyword, content)
    check(reference)
Example #4
 def post(self, *args, **kwargs):
     uploaded_file = self.request.FILES['document']
     document = docx2python(uploaded_file, self.img_path)
     os.chdir(self.img_path)
     for f in os.listdir(self.img_path):
         if f.endswith('.wmf') or f.endswith('.emf') or f.endswith('.jpeg'):
             i = Image.open(f)
             fn, fext = os.path.splitext(f)
             i.save('{}.png'.format(fn))
     document_content = document.body
     z = document_content[0][0][0]
     composite_list = [z[x:x + 8] for x in range(0, len(z), 8)]
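     # each 8-item chunk maps onto one Questions row: subject, question, image, options a-d, answer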
     for row in composite_list:
         try:
             data = Questions.objects.create(subject=row[0],
                                             question=row[1],
                                             ques_image=row[2],
                                             option_a=row[3],
                                             option_b=row[4],
                                             option_c=row[5],
                                             option_d=row[6],
                                             ans=row[7])
         except Exception as e:
             print(e)
     messages.success(self.request, 'Data uploaded successfully')
     return redirect("admin/")
Example #5
    def parse_word(self, file_location):

        #Define variables
        combo_filing = []
        exhibits = []
        emerging_growth = []
        period = []
        items = []
        eight_k = {}

        # CREATE WORD DOCUMENT OBJECT
        try:
            doc = docx.Document(file_location)
            docs = docx2python(file_location, extract_image=True)
        except Exception:
            print('Only ".DOCX" files are accepted.')
            return None
        # CREATE PARAGRAPH OBJECT
        paragraphs = doc.paragraphs

        # LOOP THROUGH PARAGRAPH LINES. FIND CENTER-ALIGNED TEXT THAT EQUALS "FORM 8-K" AND STORE IT IN A VARIABLE
        for line in paragraphs:

            if line.alignment == WD_TAB_ALIGNMENT.CENTER and (form_type := line.text.strip().title()) == 'Form 8-K':
                eight_k['FORM_TYPE'] = form_type
Example #6
def extract_table_image_count(resume, celltext):
    BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    file_path = os.path.join(BASE_DIR, "media")
    file_path_for_images = os.path.join(file_path, resume.name)
    doc = docx.Document(resume)
    doc_result = docx2python(file_path_for_images)
    table_count = len(doc.tables)
    images_count = len(doc_result.images)
    return [table_count, images_count]
Example #7
def get_random_course_fragment_from_pages():
    fragment_path = get_random_filepath_by_path(
        project_root_directory + "\\course_fragments\\pages\\",
        COURSE_FRAGMENTS_EXTENSION)
    document = docx2python(fragment_path)
    # join every line except the trailing four
    return '\n'.join(document.text.splitlines()[:-4])
Example #8
 def test_empty_properties_dict_if_docProps_not_found(self) -> None:
     """
     It seems Google Docs docx files do not contain a document properties file:
     `docProps/core.xml`. The contents of this file are returned as a dictionary.
     To correct the above error, result.properties will now return an empty
     dictionary.
     """
     result = docx2python(TEST_FILE)
     assert result.properties == {}
Example #9
    def _extract(self) -> str:
        text = ''
        try:
            # .text holds the plain text; docx2python() alone returns a result object
            text = docx2python(self.fpath).text

        except Exception as ee:
            print(f'extract_unique_text_from_msword: {ee}')

        return text
Example #10
    def __init__(self, path: str):
        """
        TODO Docs
        """
        self.sections = []

        raw = docx2python(path).body[0][0][0]
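        # body[0][0][0]: first block -> first row -> first cell -> its list of paragraph strings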

        chunks = self.chunk_raw_body(raw)
        for chunk in chunks[1:]:
            self.sections.append(Section(chunk))
Example #11
    def write_book(self, doc):
        #don't judge me, I didn't make the choice to store the info in a Word doc
        with open("data/book/book.docx", "wb") as file:
            file.seek(0)
            file.write(doc.getvalue())
            file.truncate()

        text = re.sub(self.STRIP_SPACES, "\n",
                      docx2python("data/book/book.docx").text)
        results = re.findall(self.FIND_DATA,
                             text[text.find("This document contains"):])
        return {entry[0]: entry for entry in results}
Example #12
def readWithFormatting(files, inputFolderName, outputFolderName):
    """
    Converts files into .txt
    Maintains formatting as much as possible
    Uses different libraries depending on file types
    """

    # go over every file in list containing all files
    for inputFile in files:
        doc = ''
        # get the full name and the extension of the files
        fileName, fileExtension = os.path.splitext(inputFile)
        # direct the new file name to the output folder by replacing the input folder name to the output folder name
        outputPathFileName = fileName.replace(inputFolderName,
                                              outputFolderName)
        # get the output file path (not including the filename)
        outputPath = os.path.dirname(outputPathFileName)

        if fileExtension == '.docx':
            print('Processing: {}'.format(inputFile))
            # uses docx2python for now, since it's probably easier to keep the formatting consistent later on
            doc = docx2python(inputFile)
            doc = doc.text
            # uses regex to replace repeated new lines into one
            doc = re.sub(r'\n\n*', '\n', doc)

        elif fileExtension == '.doc':
            print('Processing: {}'.format(inputFile))
            doc = textract.process(inputFile).decode("utf-8")
            # uses regex to replace repeated new lines into one newline
            doc = re.sub(r'\n\n*', '\n', doc)

        elif fileExtension == '.pdf':
            print('Processing: {}'.format(inputFile))
            # read in the pdf file as a string
            # this is a bit more complicated so it has its own function
            doc = readPDFFile(inputFile)

        elif fileExtension in ('.png', '.jpeg', '.jpg'):
            print('Processing: {}'.format(inputFile))
            # uses OCR to extract the text
            doc = textract.process(inputFile,
                                   method='tesseract',
                                   language='eng+ind').decode("utf-8")

        if doc:
            # standardise the string (e.g. convert ligatures, other encoding issues)
            doc = doc.translate(LIGATURES)
            # save the string as the file name + .txt
            newFileName = outputPathFileName + '.txt'
            with open(newFileName, 'w') as text_file:
                text_file.write(doc)
Example #13
    def __init__(self, contentsList_path):
        doc_result = docx2python(contentsList_path)
        doc_body_for_dic = doc_result.body[0][0][0]

        pre_data1 = self.remove_blank_and_reg(doc_body_for_dic)
        pre_data2, self.INDEX_LIST = self.find_num_n_remove_Front(pre_data1)
        self.CONTENTS_LIST = self.find_num_n_remove_Back(
            pre_data2)  # list of TOC entries -- usable as keywords

        # build a dictionary from the TOC entries and their indices
        self.DICTIONRAY_LIST = {
            key: value
            for key, value in zip(self.CONTENTS_LIST, self.INDEX_LIST)
        }
Example #14
def DocType(source):
    result = source.filename
    #split filename with (.) to get the file extension
    result_splitted = result.split('.')
    file_extension = result_splitted[-1]
    #check the extension type and use appropriate method to read
    if file_extension == "docx":
        doc = docx2python(source).text
        return doc
    elif file_extension == "txt":
        with open(source) as file:
            doc = file.read()
        return doc
    elif file_extension == "pdf":
        raw = parser.from_file(source)
        doc = raw['content']
        return doc
Example #15
def prepare_text(file_name, main_path):
    # create the path to the file
    fn = os.path.join(main_path, file_name)
    # convert it
    subprocess.call([r'C:\Program Files\LibreOffice\program\soffice.exe', '--headless', '--convert-to', 'docx', fn], shell=True)
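    # note: soffice writes the converted file into the current working directory by default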
    file_name = file_name[:-4] + '.docx'
    # read the file
    content = docx2python.docx2python(file_name, extract_image=False)
    # remove the docx after reading
    os.remove(file_name)
    # take the content with the structure it came with
    content = content.body
    #debug_print(content)
    # remove duplicate nodes in the lists, remove duplicate strings at the lowest level
    no_duplicates = removeInnerDups(removeDuplicates(content))
    # debug_print(no_duplicates)
    # remove empty arrs including the sorts of [[[]]]
    removeEmptyArrs(no_duplicates)
    return no_duplicates
Example #16
def use_docx2python(fileName, params):
    ''' gets our title and the place, if there's any'''
    parsed = docx2python(fileName)
    heading = parsed.body
    body = flatten(heading)
    title = ''
    place = ''
    for i in body:
        if '<a href=' in i:
            title += i

    if title:
        title = lxml.html.fromstring(title).text_content()

    if '[' in title:
        place += re.split(r'\[', title)[-1]

    params["headline"] = title
    params["place"] = place[:-1]
Example #17
    def __init__(self, path):
        #path is the location of the file, given when obj is created
        path = path + "\\"
        self.path = path
        self.error = False
        self.errorMsg = '错误提示:' + path  # '错误提示' means "error notice"
        #find file name to contract and inspection file
        os.chdir(path)
        dirEntry = os.listdir(path)
        contractName = ''
        shipmentName = ''
        for entry in dirEntry:
            # '合同' = contract, '发货单' = shipment note; skip temp, PDF, and image files
            if '合同' in entry and '$' not in entry and 'pdf' not in entry and 'jpg' not in entry and 'jpeg' not in entry and '~$' not in entry:
                contractName = entry
            if '发货单' in entry:
                shipmentName = entry
        if len(contractName) == 0 or len(shipmentName) == 0:
            foundFile = False
            self.error = True
            self.errorMsg += '\n 无法找到合同或者发货单文件,无法录入'  # "contract or shipment note not found; cannot import"
        else:
            foundFile = True
        if foundFile:

            #use docx2txt to find contract name and company name
            contractInPy = docx2txt.process(contractName)
            if '合同编号:' in contractInPy:
                self.contractNum = contractInPy[contractInPy.find('合同编号:') + 5:contractInPy.find('合同编号:') + 12]
            else:
                self.contractNum = 0
                self.error = True
                self.errorMsg += '\n 合同docx文件,无法提取合同编号'

            if '需  方:' in contractInPy and '产品名称' in contractInPy:
                fullnamecomp_re = r"(?<=需  方: )(.*)(?=产品名称)"
                match2 = re.search(fullnamecomp_re, contractInPy, flags=re.DOTALL)
                self.companyFullName = match2[0].strip()
            else:
                self.companyFullName = 0
                self.error = True
                self.errorMsg += '\n 合同docx文件,无法提取需方信息'
            #use docx2txt to fetch info in shipment info file
            shipmentinPy = docx2txt.process(shipmentName)
            if '用户:' in shipmentinPy:
                compnam_re = r"(?<=用户:)([^\s]+)"
                match = re.search(compnam_re, shipmentinPy)
                self.companyName = match[0].strip()
            else:
                self.error = True
                self.companyName = 0
                self.errorMsg += '\n 发货单内无法提取用户简称'

            if '收货单位地址:' in shipmentinPy:
                shipadd_re = r'(?<=收货单位地址:)(.*)(\s)'
                match4 = re.search(shipadd_re, shipmentinPy)
                self.address = match4[0].strip()
            elif '收货地址:' in shipmentinPy:
                shipadd_re = r'(?<=收货地址:)(.*)(\s)'
                match4 = re.search(shipadd_re, shipmentinPy)
                self.address = match4[0].strip()
            else:
                self.error = True
                self.address = 0
                self.errorMsg += '\n发货单内无法提取收货单位地址'
            phone_re = r"(?<=电话:)\s*([0-9]{3}|[0-9]{4})-*[0-9]{4}\s*[0-9]{4}"
            match3 = re.search(phone_re, shipmentinPy)
            if match3 is None:
                self.phone = '无'
            else:
                self.phone = match3[0].strip()


            # function for docx2python, remove empty element from returned list
            def remove_empty(table):
                # remove empty element of list
                return list(filter(lambda x: not isinstance(x, (str, list, tuple)) or x,
                                   (remove_empty(x) if isinstance(x, (tuple, list)) else x for x in table)))
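                # e.g. remove_empty([['a', ''], [], 'b']) -> [['a'], 'b']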


            # use docx2python to generate list and use that list to find price ,model count, and model number
            contractInList = docx2python(path + contractName)
            table = remove_empty(contractInList.body)
            self.modelNumber = []
            self.modelCount = []
            self.price = []

            for row in table[1][1:]:
                if len(row) == 5:

                    self.modelNumber.append(row[1][0])

                    if row[3][0].find("台") == -1:
                        self.modelCount.append(int(row[3][0]))
                    else:
                        self.modelCount.append(int(row[3][0][:row[3][0].find("台")]))

                    if row[2][0].find("元") == -1:
                        self.price.append(int(row[2][0]))
                    else:
                        self.price.append(int(row[2][0][:row[2][0].find("元")]))
Example #18
    def post(self, request):
        nlp = en_core_web_sm.load()
        pf = ProfanityFilter(nlps={'en': nlp})
        # pf.custom_profane_word_dictionaries = {'en': {'sold down the river', 'dog'}}
        # pf.extra_profane_word_dictionaries = {'en': {'sold', 'orange'}}
        wordlist = []
        context = {}

        # FILE UPLOADED
        if 'doc' in request.FILES:

            doc = request.FILES['doc']

            if doc.name.endswith(".docx"):
                docx = docx2python(doc, extract_image=False)
                context['doc'] = docx.text

            elif doc.name.endswith(".txt"):
                print("This is a test")

                mytext = str(doc.read())
                context['doc'] = mytext

            return render(request, 'index.html', context=context)

        # RETRIEVE WORDS AND SPLIT
        document = request.POST['document']
        word_lines = document.splitlines()

        # CHECK EACH WORD IF PROFANITY
        for line in word_lines:
            if line == '':
                wordlist.append(r'\n')

            # NO LINE BREAK CONTINUE HERE
            else:
                words = line.split()
                temp_list = []
                original_list = []
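                # temp_list accumulates cleaned words of a candidate multi-word
                # phrase; original_list keeps their original spellings for output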

                # LOOP THROUGH EACH WORD.
                for word in words:

                    clean_word = clear_punctuation(word).lower()

                    in_db = Words.objects.all().filter(
                        word__icontains=clean_word)

                    # WORD IS IN DATABASE
                    if in_db:
                        temp_list.append(clean_word)

                        temp_word = " ".join(temp_list)

                        starting_phrase = Words.objects.all().filter(
                            word__istartswith=temp_word)

                        # CURRENT WORD IS THE START OF THE PHRASE
                        if starting_phrase:

                            original_list.append(word)

                            completed = Words.objects.all().filter(
                                word__iexact=temp_word)

                            # CURRENT PHRASE IS COMPLETED
                            if completed:
                                original = " ".join(original_list)
                                original_list.clear()

                                new_word = format_word(original)
                                wordlist.append(new_word)

                                temp_list.clear()

                            # # TEMP WORD DID NOT COMPLETE THE PHRASE
                            # else:
                            #     print('now we here bish')
                            #     original = " ".join(original_list)
                            #     original_list.clear()

                            #     wordlist.append(original)

                            #     temp_list.clear()

                        # NOT START OF PHRASE KEEP GOING
                        else:
                            wordlist.append(word)
                            temp_list.clear()
                            original_list.clear()

                    # WORD IS A PROFANITY
                    elif pf._is_profane_word('en', clean_word):

                        temp_word = " ".join(temp_list)
                        wordlist.append(temp_word)

                        new_word = format_word(word)
                        wordlist.append(new_word)
                        temp_list.clear()

                    # JUST A REGULAR WORD
                    else:
                        temp_word = " ".join(temp_list)
                        wordlist.append(temp_word)

                        wordlist.append(word)

                        temp_list.clear()

        context["results"] = " ".join(wordlist)
        context['document'] = document

        return render(request, 'index.html', context=context)
Example #19
def get_text_d2p(filename):
    result = docx2python(filename)
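    # result.document includes header and footer content as well as the body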
    string_list = list(flatten_list(result.document))
    return remove_placeholders(string_list)
Example #20
    print(probs)


def check_for_ftnotes(body):
    found = []
    with open("found_ftnotes.csv", 'r') as f:
        for row in csv.reader(f):
            ftnote_found = row[1]
            found.append(ftnote_found)
    with open("ftnotes.csv", 'r') as orig_f:
        for row in csv.reader(orig_f):
            if len(row) > 1:
                perek, orig_ftnote_marker, orig_ftnote = row
                if orig_ftnote not in found:
                    if len(orig_ftnote) > 2:
                        print(row[1:])
                    else:
                        print("Strange case")


bible_sections = [
    ref.normal() for el in library.get_indexes_in_category("Tanakh")
    for ref in library.get_index(el).all_section_refs()
]
start_at = 0
document = docx2python("betulah.docx", html=True)
body, ftnotes = get_body_html(document, {})
body = get_body_insert_ftnotes(body, ftnotes)
#check_for_ftnotes(body)
#post(body)
Example #21
    prev_body = []
    prev_match = prev_beer_match = None  # get_match() below reads these before first assignment
    last_ftnote_found = 0
    text = {}
    for (dirpath, dirnames, filenames) in walk(path):
        filenames = [file for file in filenames if file.endswith(".docx") and "~$" not in file]
        filenames = sorted(filenames, key=lambda f: int(re.search(r"\d+", f).group(0)))
        create_footnotes_indices(dirpath)
        counter = 0
        for f in filenames:
            if "OH" not in f:
                continue
            docx_file = dirpath+"/"+f
            index = library.get_index(dirpath.split("/")[1])
            if not index.title in text:
                text[index.title] = {"Footnotes": {}}
            document = docx2python(docx_file)
            ftnotes = get_footnotes(document)
            header = document.header[0][0][0][0]
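            # header[0][0][0][0]: first header block -> first row -> first cell -> first paragraph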

            match, prev_beer_match = get_match(header, prev_beer_match, prev_match)
            if match not in text[index.title]:
                text[index.title][match] = []
                text[index.title]["Footnotes"][match] = []
                last_ftnote_found = 0


            body, last_ftnote_found = parse_body(document, index, ftnotes, last_ftnote_found)
            text[index.title][match] += body
            text[index.title]["Footnotes"][match] += ftnotes
            prev_match = match
            prev_body = body
Example #22
def app():

    ##### Set up data for modelling

    # import data to model CV against
    workpac_data = pd.read_csv('job_descriptions.csv')

    # remove id column as we can use index
    workpac_data = workpac_data[['job_link', 'job_description', 'job_title']]

    # clean Na's so functions can run
    workpac_data = workpac_data.replace(np.nan, ' ', regex=True)

    # clean job description
    workpac_data['job_description_cleaned'] = workpac_data[
        'job_description'].apply(_removeNonAscii)
    workpac_data[
        'job_description_cleaned'] = workpac_data.job_description_cleaned.apply(
            func=make_lower_case)
    workpac_data[
        'job_description_cleaned'] = workpac_data.job_description_cleaned.apply(
            func=remove_stop_words)
    workpac_data[
        'job_description_cleaned'] = workpac_data.job_description_cleaned.apply(
            func=remove_punctuation)
    workpac_data[
        'job_description_cleaned'] = workpac_data.job_description_cleaned.apply(
            func=remove_html)

    workpac_data['job_title_cleaned'] = workpac_data['job_title'].apply(
        _removeNonAscii)
    workpac_data['job_title_cleaned'] = workpac_data.job_title_cleaned.apply(
        func=make_lower_case)
    workpac_data['job_title_cleaned'] = workpac_data.job_title_cleaned.apply(
        func=remove_stop_words)
    workpac_data['job_title_cleaned'] = workpac_data.job_title_cleaned.apply(
        func=remove_punctuation)
    workpac_data['job_title_cleaned'] = workpac_data.job_title_cleaned.apply(
        func=remove_html)

    # stem cleaned data
    workpac_data['job_description_cleaned'] = workpac_data[
        'job_description_cleaned'].apply(stem_sentences)
    workpac_data['job_title_cleaned'] = workpac_data[
        'job_title_cleaned'].apply(stem_sentences)

    workpac_data['job_text_cleaned'] = workpac_data[
        'job_title_cleaned'] + ' ' + workpac_data['job_description_cleaned']

    model_df = workpac_data[['job_link', 'job_text_cleaned']]

    # title of streamlit app
    st.title(
        'Candidate Job Ranking System Using Natural Language Processing (NLP)')

    # sidebar
    st.subheader('Files')

    # file uploader
    uploaded_file = st.file_uploader(label='Please upload your CV',
                                     type=['docx'])

    # create message variable
    message = ''
    doc = ''

    if uploaded_file is not None:
        try:
            doc = docx2python(uploaded_file).text
            message = 'File upload successful'
        except Exception as e:
            message = 'There was an error uploading your file'

    # display success/failure message

    st.text(message)

    # display uploaded file (text only as the method used in docx2python is .text)

    try:
        st.text(doc)
    except Exception as e:
        print(e)
        st.write('Please upload CV')

    if doc != '':
        # call recommender function
        recommended_jobs = recommender(model_df, doc)

        # create output dataframe
        recommended_jobs = pd.merge(left=recommended_jobs,
                                    right=workpac_data,
                                    left_on='job_link',
                                    right_on='job_link')

        # summarise output
        recommender_summary = recommended_jobs[['job_title', 'job_link']]

        # display output

        fig = go.Figure(data=[
            go.Table(header=dict(values=list(recommender_summary.columns),
                                 fill_color='paleturquoise',
                                 align='left'),
                     cells=dict(values=[
                         recommender_summary.job_title,
                         recommender_summary.job_link
                     ],
                                fill_color='lavender',
                                align='left'))
        ])

        st.write(fig)
Example #23
"""This script uses the docx2python module, which extracts .docx headers,
footers, text, footnotes, endnotes, properties, and images to a Python object."""

from docx2python import docx2python

#Show file properties
print("Properties:")
print(docx2python('./Lab7.docx').properties)
print('\n')

print("Press enter to continue")
keypress = input()
print('\n')

#Show Header
print("Header:")
print(docx2python('./Lab7.docx').header)
print('\n')

print("Press enter to continue")
keypress = input()
print('\n')

#Show Footer
print("Footer:")
print(docx2python('./Lab7.docx').footer)
print('\n')
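
#The remaining attributes (text, body, footnotes, images) follow the same
#pattern; for example (a sketch, not part of the original script):
print("Text:")
print(docx2python('./Lab7.docx').text)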
Example #24
    new_file_abs = re.sub(r'\.\w+$', '.docx', new_file_abs)
    word.ActiveDocument.SaveAs(new_file_abs, FileFormat=constants.wdFormatXMLDocument)
    doc.Close(False)
    file = file+"x"
    return file, True



path = r"C:\\PATH\\TO\\FILES\\"
for file in os.listdir(path):
    if not file.endswith(".doc") and not file.endswith(".docx"):
        continue
    
    newFileCreated = False
    
    if file.endswith(".doc"):
        file, newFileCreated = save_as_docx(path,file)
    
    filepath = path+file           
    content = docx2python(filepath)           
    lines = list(iter_paragraphs(content.document))
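    # iter_paragraphs comes from docx2python.iterators (import not shown in this snippet)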
    
    # if a new docx file was created, you want to delete it after getting the contents out
    if newFileCreated:
        os.remove(file)
    
    '''
    Do the file processing here
    '''

Example #25
print('hello')
# hi cam
from pathlib import Path
from docx2python import docx2python

word_doc = Path(
    r"J:\C04100_C04199\C04147_5_Semple_St_Porirua_Wellington\C04147100_Due_Diligence\007_Work\Reporting\DSI\C04147100R001_FINAL.docx"
)

print(word_doc)
word = docx2python(word_doc)  # use the full path built above
text = word.body
exec_ = text[2]  # trailing underscore avoids shadowing the built-in exec
Example #26
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from docx2python import docx2python
import pandas as pd

# reading word file
doc_data = docx2python('Carbonemissiondata.docx')

# column headings of the numeric fields we will use
headings = [
    "per capita CO2 (kg per person)", "per capita CO (kg per person)",
    "per capita CH4 (kg per person)"
]

# making our dataframe using pandas
df = pd.DataFrame(doc_data.body[0][1:]). \
    applymap(lambda val: val[0].strip("\t"))

# retrieving original first row (columns headings)
df.columns = [val[0].strip("\t") for val in doc_data.body[0][0]]

# convert the numeric columns to float, since docx2python returns table data as strings
for i in range(3):
    df[headings[i]] = df[headings[i]].astype(float)

# we will work on col. 2 and 3
X = df.iloc[:, [2, 3]].values

wcss = []
# using elbow method to detect optimum clusters numbers
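# the snippet ends here; a minimal sketch of the loop that comment implies
# (an assumption, not part of the original source):
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=0).fit(X)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.xlabel('number of clusters')
plt.ylabel('within-cluster sum of squares')
plt.show()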
Example #27
from docx2python import docx2python
from docx import Document
from docx import table

from functions import writeToFile

document1 = docx2python("D:\\PycharmProjects\\docfilefinder\\MASTERRYANLABELFILE.docx")
document2 = docx2python("D:\\PycharmProjects\\docfilefinder\\RYANSTORELABELS.docx")

doc2body = document2.body
body = document1.body
count = 0
doc1list = dict()
for innerbody in body:
    for row in innerbody:
        if row[0][0] == "":
            continue
        doc1list[row[0][0]] = row[0]
        doc1list[row[2][0]] = row[2]
        doc1list[row[4][0]] = row
        count += 1
print("Document 1: {MASTERRYANLABELFILE} List", len(doc1list), "Count : ", count, "Check", count * 3)

# for Document 2

count = 0
doc2list = dict()
for innerbody in doc2body:
    for row in innerbody:
        if row[0][0] == "":
            continue
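        # the original snippet is cut off here; presumably it mirrors the
        # document-1 loop above (an assumption, not from the source):
        doc2list[row[0][0]] = row[0]
        doc2list[row[2][0]] = row[2]
        doc2list[row[4][0]] = row
        count += 1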
Example #28
def loadDocx(doc):
    return docx2python(doc)
Example #29
def extract_image(resume):
    result = docx2python(resume)
    return len(result.images)
Example #30
from docx2python import docx2python
from XmlConverterAPI import Structure, Converter
import re

file = docx2python('Files/ForDocx.docx')
structures = []  # "list" would shadow the built-in, so use a descriptive name

pattern = r"\[|\]|'"
regex = re.compile(pattern)

doc = str(file.document).split("',")

for line in doc:
    line = regex.sub('', line)

    if '.' not in line and (line.count(' ')) <= 9:
        structures.append(Structure(name=Converter.BLOCK, value=line))
    else:
        structures.append(Structure(name=Converter.TEXT, value=line))

Converter.createXML(structures, "FromDocx_XML.xml")