def make_jsonFile(dic_path, doc_path, file_name):
    igate_cl = generate_Dictionary(dic_path)  # build a dictionary from the table of contents
    index_dictionary = igate_cl.DICTIONRAY_LIST
    json_file = []
    # load the document
    doc_result = docx2python(doc_path)
    main_title = ''
    sub_title = ''
    title = ''
    index_len = ''
    for j in range(1, len(doc_result.body)):
        doc_body = doc_result.body[j]
        doc_len = len(doc_body)
        if doc_len == 1:
            # split the document
            doc_list = generate_doc(doc_body, doc_len, index_dictionary)
            print(doc_list)
            for content in doc_list:
                for text in content:
                    if text in index_dictionary:
                        index = index_dictionary[text]  # index is the number of that heading in the dictionary
                        index_len = len(index)  # 1: major / 2: middle / 3: minor category
                        if index_len == 1:
                            main_title = text
                            sub_title = text
                            title = text
                        elif index_len == 2:
                            sub_title = text
                            title = text
                        elif index_len == 3:
                            title = text
                json_data = generate_doc_to_json(index_len, content, main_title, sub_title, title, "string")
                json_file.append(json_data)
        else:
            print('ID = {0}'.format(str(j)))
            # table
            table = generate_table(doc_body, doc_len)
            json_data = generate_doc_to_json(index_len, table, main_title, sub_title, title, "table")
            json_file.append(json_data)
    with open(path.RESULT_DATA_PATH + file_name + '.json', 'w', encoding='utf-8') as make_file:
        json.dump(json_file, make_file, ensure_ascii=False, indent="\t")
def docx2text(self, filename):
    """
    :param filename: source .docx file
    :return: text content of the .docx
    """
    def flatten(S):
        """
        Flatten a nested list.
        :param S: nested list
        :return: a single, non-nested list
        """
        if S == []:
            return S
        if isinstance(S[0], list):
            return flatten(S[0]) + flatten(S[1:])
        return S[:1] + flatten(S[1:])

    if filename.split('.')[-1] == "docx":
        # extract the text
        doc_result = docx2python(filename)
        # flatten the result
        res = flatten(doc_result.body)
        # drop blank runs
        res = [r for r in res if r.strip()]
        # join back into a single string
        content = '。'.join(res)
    elif filename.split('.')[-1] == "doc":
        content = subprocess.check_output(['antiword', filename])
        content = content.decode('utf-8')
    else:
        # neither .docx nor .doc: return an empty string rather than raising NameError
        content = ''
    return content
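# The recursive flatten above can hit Python's recursion limit on deeply
# nested bodies. A hedged iterative alternative (a sketch, not part of the
# original snippet):
def flatten_iterative(nested):
    """Flatten an arbitrarily nested list without recursion."""
    stack, flat = [nested], []
    while stack:
        item = stack.pop()
        if isinstance(item, list):
            stack.extend(reversed(item))  # reversed so pops preserve order
        else:
            flat.append(item)
    return flat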
def main(path):
    print(path)
    real_path = path
    if path.endswith(".doc"):
        word = wc.Dispatch('Word.Application')
        doc = word.Documents.Open(path)
        doc.SaveAs(path + "x", 12, False, "", True, "", False, False, False, False)
        doc.Close()
        word.Quit()
        real_path = path + "x"
    elif not path.endswith(".docx"):
        print("Please check that the file extension is valid!")
        return
    keyword = '参考文献'
    file = docx2python(real_path)
    temp = file.text.split('\n')
    content = []
    for i in range(len(temp)):
        if i % 2 == 0:
            content.append(temp[i])
    # `content` can be printed here for inspection; it is already plain text
    reference = extract(keyword, content)
    check(reference)
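# `extract` and `check` are not shown in the snippet. A hedged sketch of what
# `extract` plausibly does, collecting the lines after the reference heading
# (an assumption, not the original implementation):
def extract(keyword, lines):
    """Return the non-blank lines after the first line containing `keyword`."""
    for i, line in enumerate(lines):
        if keyword in line:
            return [l for l in lines[i + 1:] if l.strip()]
    return []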
def post(self, *args, **kwargs):
    uploaded_file = self.request.FILES['document']
    document = docx2python(uploaded_file, self.img_path)
    os.chdir(self.img_path)
    # convert extracted images to PNG
    for f in os.listdir(self.img_path):
        if f.endswith('.wmf') or f.endswith('.emf') or f.endswith('.jpeg'):
            i = Image.open(f)
            fn, fext = os.path.splitext(f)
            i.save('{}.png'.format(fn))
    document_content = document.body
    z = document_content[0][0][0]
    # split the flat cell list into rows of 8 fields
    composite_list = [z[x:x + 8] for x in range(0, len(z), 8)]
    for row in composite_list:
        try:
            data = Questions.objects.create(subject=row[0], question=row[1],
                                            ques_image=row[2], option_a=row[3],
                                            option_b=row[4], option_c=row[5],
                                            option_d=row[6], ans=row[7])
        except Exception as e:
            print(e)
    messages.success(self.request, 'Data uploaded successfully')
    return redirect("admin/")
def parse_word(self, file_location):
    # define variables
    combo_filing = []
    exhibits = []
    emerging_growth = []
    period = []
    items = []
    eight_k = {}
    # create the Word document objects
    try:
        doc = docx.Document(file_location)
        docs = docx2python(file_location, extract_image=True)
    except Exception:
        print('Only ".DOCX" files are accepted.')
        return None
    # create the paragraph object
    paragraphs = doc.paragraphs
    # loop through the paragraphs; capture center-aligned text equal to "Form 8-K"
    for line in paragraphs:
        if line.alignment == WD_TAB_ALIGNMENT.CENTER and (form_type := line.text.strip().title()) == 'Form 8-K':
            eight_k['FORM_TYPE'] = form_type
def extract_table_image_count(resume, celltext):
    BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    file_path = os.path.join(BASE_DIR, "media")
    file_path_for_images = os.path.join(file_path, resume.name)
    doc = docx.Document(resume)
    doc_result = docx2python(file_path_for_images)
    # python-docx exposes tables directly; docx2python exposes an images dict
    table_count = len(doc.tables)
    images_count = len(doc_result.images)
    return [table_count, images_count]
def get_random_course_fragment_from_pages():
    fragment_path = get_random_filepath_by_path(
        project_root_directory + "\\course_fragments\\pages\\",
        COURSE_FRAGMENTS_EXTENSION)
    document = docx2python(fragment_path)
    # drop the last four lines and rejoin the rest
    return '\n'.join(document.text.splitlines()[:-4])
def test_empty_properties_dict_if_docProps_not_found(self) -> None:
    """
    It seems Google Docs docx files do not contain a document properties
    file: `docProps/core.xml`. The contents of this file are normally
    returned as a dictionary. To handle such files, result.properties now
    returns an empty dictionary.
    """
    result = docx2python(TEST_FILE)
    assert result.properties == {}
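# A hedged application-side sketch of the same defensive pattern: treat
# result.properties as possibly empty instead of assuming docProps/core.xml
# exists (the file name and the 'creator' key are assumptions):
from docx2python import docx2python

result = docx2python('google_docs_export.docx')
author = result.properties.get('creator', 'unknown')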
def _extract(self) -> str:
    text = ''
    try:
        # docx2python() returns a result object; .text gives the plain string,
        # matching the declared return type
        text = docx2python(self.fpath).text
    except Exception as ee:
        print(f'extract_unique_text_from_msword: {ee}')
    return text
def __init__(self, path: str):
    """Build the section list from the first cell of the document body."""
    self.sections = []
    raw = docx2python(path).body[0][0][0]
    chunks = self.chunk_raw_body(raw)
    # the first chunk is the preamble; everything after it is a section
    for chunk in chunks[1:]:
        self.sections.append(Section(chunk))
def write_book(self, doc):
    # don't judge me, I didn't make the choice to store the info in a Word doc
    with open("data/book/book.docx", "wb") as file:
        file.seek(0)
        file.write(doc.getvalue())
        file.truncate()
    text = re.sub(self.STRIP_SPACES, "\n", docx2python("data/book/book.docx").text)
    results = re.findall(self.FIND_DATA, text[text.find("This document contains"):])
    return {entry[0]: entry for entry in results}
def readWithFormatting(files, inputFolderName, outputFolderName):
    """
    Converts files into .txt
    Maintains formatting as much as possible
    Uses different libraries depending on file types
    """
    # go over every file in the list containing all files
    for inputFile in files:
        doc = ''
        # get the full name and the extension of the file
        fileName, fileExtension = os.path.splitext(inputFile)
        # direct the new file name to the output folder by replacing the input folder name with the output folder name
        outputPathFileName = fileName.replace(inputFolderName, outputFolderName)
        # get the output file path (not including the filename)
        outputPath = os.path.dirname(outputPathFileName)
        if fileExtension == '.docx':
            print('Processing: {}'.format(inputFile))
            # uses docx2python for now, since it's probably easier to keep the formatting consistent later on
            doc = docx2python(inputFile).text
            # use regex to collapse repeated newlines into one
            doc = re.sub(r'\n\n*', '\n', doc)
        elif fileExtension == '.doc':
            print('Processing: {}'.format(inputFile))
            doc = textract.process(inputFile).decode("utf-8")
            # use regex to collapse repeated newlines into one
            doc = re.sub(r'\n\n*', '\n', doc)
        elif fileExtension == '.pdf':
            print('Processing: {}'.format(inputFile))
            # read in the pdf file as a string;
            # this is a bit more complicated, so it has its own function
            doc = readPDFFile(inputFile)
        elif fileExtension in ('.png', '.jpeg', '.jpg'):
            print('Processing: {}'.format(inputFile))
            # use OCR to extract the text
            doc = textract.process(inputFile, method='tesseract', language='eng+ind').decode("utf-8")
        if doc:
            # standardise the string (e.g. convert ligatures, other encoding issues)
            doc = doc.translate(LIGATURES)
            # save the string as the file name + .txt
            newFileName = outputPathFileName + '.txt'
            with open(newFileName, 'w') as text_file:
                text_file.write(doc)
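# LIGATURES is referenced above but not defined in the snippet. A hedged
# sketch of what such a translation table might look like (the exact mapping
# is an assumption):
LIGATURES = str.maketrans({
    '\ufb01': 'fi',  # fi ligature
    '\ufb02': 'fl',  # fl ligature
    '\u2019': "'",   # right single quotation mark
    '\u201c': '"',   # left double quotation mark
    '\u201d': '"',   # right double quotation mark
})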
def __init__(self, contentsList_path):
    doc_result = docx2python(contentsList_path)
    doc_body_for_dic = doc_result.body[0][0][0]
    pre_data1 = self.remove_blank_and_reg(doc_body_for_dic)
    pre_data2, self.INDEX_LIST = self.find_num_n_remove_Front(pre_data1)
    # list of table-of-contents headings; usable as keywords
    self.CONTENTS_LIST = self.find_num_n_remove_Back(pre_data2)
    # build the dictionary from the headings and their index numbers
    self.DICTIONRAY_LIST = dict(zip(self.CONTENTS_LIST, self.INDEX_LIST))
def DocType(source):
    result = source.filename
    # split the filename on '.' to get the file extension
    result_splitted = result.split('.')
    file_extension = result_splitted[-1]
    # check the extension and use the appropriate reader
    if file_extension == "docx":
        doc = docx2python(source).text
        return doc
    elif file_extension == "txt":
        with open(source) as file:
            doc = file.read()
        return doc
    elif file_extension == "pdf":
        raw = parser.from_file(source)
        doc = raw['content']
        return doc
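# A hedged variant of the extension check: os.path.splitext handles names
# containing extra dots more robustly than split('.') (a sketch under the
# same `source.filename` assumption as above):
import os

def get_extension(filename):
    """Return the extension without the leading dot, lowercased."""
    return os.path.splitext(filename)[1].lstrip('.').lower()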
def prepare_text(file_name, main_path):
    # create the path to the file
    fn = os.path.join(main_path, file_name)
    # convert it to .docx with headless LibreOffice
    subprocess.call([r'C:\Program Files\LibreOffice\program\soffice.exe',
                     '--headless', '--convert-to', 'docx', fn], shell=True)
    file_name = file_name[:-4] + '.docx'
    # read the file
    content = docx2python.docx2python(file_name, extract_image=False)
    # remove the docx after reading
    os.remove(file_name)
    # take the content with the structure it came with
    content = content.body
    # remove duplicate nodes in the lists, remove duplicate strings at the lowest level
    no_duplicates = removeInnerDups(removeDuplicates(content))
    # remove empty arrays, including nestings like [[[]]]
    removeEmptyArrs(no_duplicates)
    return no_duplicates
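# removeDuplicates, removeInnerDups, and removeEmptyArrs are not shown in the
# snippet. A hedged sketch of the last one, pruning empty nested lists in
# place (an assumption about its behaviour):
def removeEmptyArrs(nested):
    """Recursively drop empty lists (including [[[]]]) from `nested` in place."""
    for item in list(nested):
        if isinstance(item, list):
            removeEmptyArrs(item)
            if not item:
                nested.remove(item)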
def use_docx2python(fileName, params):
    '''gets our title and the place, if there's any'''
    parsed = docx2python(fileName)
    heading = parsed.body
    body = flatten(heading)
    title = ''
    place = ''
    for i in body:
        if '<a href=' in i:
            title += i
    if title:
        title = lxml.html.fromstring(title).text_content()
    if '[' in title:
        place += re.split(r'\[', title)[-1]
    params["headline"] = title
    params["place"] = place[:-1]
def __init__(self, path):
    # path is the location of the file, given when the object is created
    path = path + "\\"
    self.path = path
    self.error = False
    self.errorMsg = '错误提示:' + path
    # find the file names of the contract and shipment files
    os.chdir(path)
    dirEntry = os.listdir(path)
    contractName = ''
    shipmentName = ''
    for entry in dirEntry:
        if ('合同' in entry and '$' not in entry and 'pdf' not in entry
                and 'jpg' not in entry and 'jpeg' not in entry and '~$' not in entry):
            contractName = entry
        if '发货单' in entry:
            shipmentName = entry
    if len(contractName) == 0 or len(shipmentName) == 0:
        foundFile = False
        self.error = True
        self.errorMsg += '\n 无法找到合同或者发货单文件,无法录入'
    else:
        foundFile = True
    if foundFile:
        # use docx2txt to find the contract number and company name
        contractInPy = docx2txt.process(contractName)
        if '合同编号:' in contractInPy:
            self.contractNum = contractInPy[contractInPy.find('合同编号:') + 5:
                                            contractInPy.find('合同编号:') + 12]
        else:
            self.contractNum = 0
            self.error = True
            self.errorMsg += '\n 合同docx文件,无法提取合同编号'
        if '需  方:' in contractInPy and '产品名称' in contractInPy:
            fullnamecomp_re = r"(?<=需  方: )(.*)(?=产品名称)"
            match2 = re.search(fullnamecomp_re, contractInPy, flags=re.DOTALL)
            self.companyFullName = match2[0].strip()
        else:
            self.companyFullName = 0
            self.error = True
            self.errorMsg += '\n 合同docx文件,无法提取需方信息'
        # use docx2txt to fetch the info in the shipment file
        shipmentinPy = docx2txt.process(shipmentName)
        if '用户:' in shipmentinPy:
            compnam_re = r"(?<=用户:)([^\s]+)"
            match = re.search(compnam_re, shipmentinPy)
            self.companyName = match[0].strip()
        else:
            self.error = True
            self.companyName = 0
            self.errorMsg += '\n 发货单内无法提取用户简称'
        if '收货单位地址:' in shipmentinPy:
            shipadd_re = r'(?<=收货单位地址:)(.*)(\s)'
            match4 = re.search(shipadd_re, shipmentinPy)
            self.address = match4[0].strip()
        elif '收货地址:' in shipmentinPy:
            shipadd_re = r'(?<=收货地址:)(.*)(\s)'
            match4 = re.search(shipadd_re, shipmentinPy)
            self.address = match4[0].strip()
        else:
            self.error = True
            self.address = 0
            self.errorMsg += '\n发货单内无法提取收货单位地址'
        phone_re = r"(?<=电话:)\s*([0-9]{3}|[0-9]{4})-*[0-9]{4}\s*[0-9]{4}"
        match3 = re.search(phone_re, shipmentinPy)
        if match3 is None:
            self.phone = '无'
        else:
            self.phone = match3[0].strip()

        # helper for docx2python: remove empty elements from the returned nested list
        def remove_empty(table):
            return list(filter(
                lambda x: not isinstance(x, (str, list, tuple)) or x,
                (remove_empty(x) if isinstance(x, (tuple, list)) else x for x in table)))

        # use docx2python to generate a nested list, then use it to find the
        # price, model count, and model number
        contractInList = docx2python(path + contractName)
        table = remove_empty(contractInList.body)
        self.modelNumber = []
        self.modelCount = []
        self.price = []
        for row in table[1][1:]:
            if len(row) == 5:
                self.modelNumber.append(row[1][0])
                if row[3][0].find("台") == -1:
                    self.modelCount.append(int(row[3][0]))
                else:
                    self.modelCount.append(int(row[3][0][:row[3][0].find("台")]))
                if row[2][0].find("元") == -1:
                    self.price.append(int(row[2][0]))
                else:
                    self.price.append(int(row[2][0][:row[2][0].find("元")]))
def post(self, request):
    nlp = en_core_web_sm.load()
    pf = ProfanityFilter(nlps={'en': nlp})
    wordlist = []
    context = {}
    # FILE UPLOADED
    if 'doc' in request.FILES:
        doc = request.FILES['doc']
        if doc.name.endswith(".docx"):
            docx = docx2python(doc, extract_image=False)
            context['doc'] = docx.text
        elif doc.name.endswith(".txt"):
            mytext = str(doc.read())
            context['doc'] = mytext
        return render(request, 'index.html', context=context)
    # RETRIEVE WORDS AND SPLIT
    document = request.POST['document']
    word_lines = document.splitlines()
    # CHECK EACH WORD IF PROFANITY
    for line in word_lines:
        if line == '':
            wordlist.append(r'\n')
        else:
            words = line.split()
            temp_list = []
            original_list = []
            # LOOP THROUGH EACH WORD
            for word in words:
                clean_word = clear_punctuation(word).lower()
                in_db = Words.objects.filter(word__icontains=clean_word)
                # WORD IS IN DATABASE
                if in_db:
                    temp_list.append(clean_word)
                    temp_word = " ".join(temp_list)
                    starting_phrase = Words.objects.filter(word__istartswith=temp_word)
                    # CURRENT WORD IS THE START OF THE PHRASE
                    if starting_phrase:
                        original_list.append(word)
                        completed = Words.objects.filter(word__iexact=temp_word)
                        # CURRENT PHRASE IS COMPLETED
                        if completed:
                            original = " ".join(original_list)
                            original_list.clear()
                            new_word = format_word(original)
                            wordlist.append(new_word)
                            temp_list.clear()
                    # NOT START OF PHRASE, KEEP GOING
                    else:
                        wordlist.append(word)
                        temp_list.clear()
                        original_list.clear()
                # WORD IS A PROFANITY
                elif pf._is_profane_word('en', clean_word):
                    temp_word = " ".join(temp_list)
                    wordlist.append(temp_word)
                    new_word = format_word(word)
                    wordlist.append(new_word)
                    temp_list.clear()
                # JUST A REGULAR WORD
                else:
                    temp_word = " ".join(temp_list)
                    wordlist.append(temp_word)
                    wordlist.append(word)
                    temp_list.clear()
    context["results"] = " ".join(wordlist)
    context['document'] = document
    return render(request, 'index.html', context=context)
def get_text_d2p(filename):
    result = docx2python(filename)
    string_list = list(flatten_list(result.document))
    return remove_placeholders(string_list)
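# flatten_list and remove_placeholders are not shown in the snippet. Hedged
# sketches of plausible implementations (assumptions, not the originals):
def flatten_list(nested):
    """Yield every string in an arbitrarily nested list."""
    for item in nested:
        if isinstance(item, list):
            yield from flatten_list(item)
        else:
            yield item

def remove_placeholders(strings):
    """Drop docx2python image placeholders such as '----media/image1.png----'."""
    return [s for s in strings if not s.startswith('----')]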
print(probs)

def check_for_ftnotes(body):
    found = []
    with open("found_ftnotes.csv", 'r') as f:
        for row in csv.reader(f):
            ftnote_found = row[1]
            found.append(ftnote_found)
    with open("ftnotes.csv", 'r') as orig_f:
        for row in csv.reader(orig_f):
            if len(row) > 1:
                perek, orig_ftnote_marker, orig_ftnote = row
                if orig_ftnote not in found:
                    if len(orig_ftnote) > 2:
                        print(row[1:])
                    else:
                        print("Strange case")

bible_sections = [
    ref.normal()
    for el in library.get_indexes_in_category("Tanakh")
    for ref in library.get_index(el).all_section_refs()
]
start_at = 0
document = docx2python("betulah.docx", html=True)
body, ftnotes = get_body_html(document, {})
body = get_body_insert_ftnotes(body, ftnotes)
#check_for_ftnotes(body)
#post(body)
prev_body = []
last_ftnote_found = 0
text = {}
for (dirpath, dirnames, filenames) in walk(path):
    filenames = [file for file in filenames
                 if file.endswith(".docx") and "~$" not in file]
    filenames = sorted(filenames, key=lambda f: int(re.search(r"\d+", f).group(0)))
    create_footnotes_indices(dirpath)
    counter = 0
    for f in filenames:
        if "OH" not in f:
            continue
        docx_file = dirpath + "/" + f
        index = library.get_index(dirpath.split("/")[1])
        if index.title not in text:
            text[index.title] = {"Footnotes": {}}
        document = docx2python(docx_file)
        ftnotes = get_footnotes(document)
        header = document.header[0][0][0][0]
        match, prev_beer_match = get_match(header, prev_beer_match, prev_match)
        if match not in text[index.title]:
            text[index.title][match] = []
            text[index.title]["Footnotes"][match] = []
            last_ftnote_found = 0
        body, last_ftnote_found = parse_body(document, index, ftnotes, last_ftnote_found)
        text[index.title][match] += body
        text[index.title]["Footnotes"][match] += ftnotes
        prev_match = match
        prev_body = body
def app():
    ##### Set up data for modelling
    # import data to model the CV against
    workpac_data = pd.read_csv('job_descriptions.csv')
    # remove the id column, as we can use the index
    workpac_data = workpac_data[['job_link', 'job_description', 'job_title']]
    # clean NaNs so the functions can run
    workpac_data = workpac_data.replace(np.nan, ' ', regex=True)
    # clean the job descriptions and titles
    workpac_data['job_description_cleaned'] = (
        workpac_data['job_description']
        .apply(_removeNonAscii)
        .apply(make_lower_case)
        .apply(remove_stop_words)
        .apply(remove_punctuation)
        .apply(remove_html))
    workpac_data['job_title_cleaned'] = (
        workpac_data['job_title']
        .apply(_removeNonAscii)
        .apply(make_lower_case)
        .apply(remove_stop_words)
        .apply(remove_punctuation)
        .apply(remove_html))
    # stem the cleaned data
    workpac_data['job_description_cleaned'] = workpac_data['job_description_cleaned'].apply(stem_sentences)
    workpac_data['job_title_cleaned'] = workpac_data['job_title_cleaned'].apply(stem_sentences)
    workpac_data['job_text_cleaned'] = (workpac_data['job_title_cleaned'] + ' '
                                        + workpac_data['job_description_cleaned'])
    model_df = workpac_data[['job_link', 'job_text_cleaned']]
    # title of the streamlit app
    st.title('Candidate Job Ranking System Using Natural Language Processing (NLP)')
    # sidebar
    st.subheader('Files')
    # file uploader
    uploaded_file = st.file_uploader(label='Please upload your CV', type=['docx'])
    # create the message variable
    message = ''
    doc = ''
    if uploaded_file is not None:
        try:
            doc = docx2python(uploaded_file).text
            message = 'File upload successful'
        except Exception:
            message = 'There was an error uploading your file'
    # display success/failure message
    st.text(message)
    # display the uploaded file (text only, since docx2python's .text is used)
    try:
        st.text(doc)
    except Exception as e:
        print(e)
        st.write('Please upload CV')
    if doc != '':
        # call the recommender function
        recommended_jobs = recommender(model_df, doc)
        # create the output dataframe
        recommended_jobs = pd.merge(left=recommended_jobs, right=workpac_data, on='job_link')
        # summarise the output
        recommender_summary = recommended_jobs[['job_title', 'job_link']]
        # display the output
        fig = go.Figure(data=[
            go.Table(header=dict(values=list(recommender_summary.columns),
                                 fill_color='paleturquoise', align='left'),
                     cells=dict(values=[recommender_summary.job_title,
                                        recommender_summary.job_link],
                                fill_color='lavender', align='left'))
        ])
        st.write(fig)
"""This script uses the docx2python module, which extracts .docx headers, footers, text, footnotes, endnotes, properties, and images to a Python object.""" from docx2python import docx2python #Show file properties print("Properties:") print(docx2python('./Lab7.docx').properties) print('\n') print("Press enter to continue") keypress = input() print('\n') #Show Header print("Header:") print(docx2python('./Lab7.docx').header) print('\n') print("Press enter to continue") keypress = input() print('\n') #Show Footer print("Footer:") print(docx2python('./Lab7.docx').footer) print('\n')
    new_file_abs = re.sub(r'\.\w+$', '.docx', new_file_abs)
    word.ActiveDocument.SaveAs(new_file_abs, FileFormat=constants.wdFormatXMLDocument)
    doc.Close(False)
    file = file + "x"
    return file, True

path = r"C:\\PATH\\TO\\FILES\\"
for file in os.listdir(path):
    # skip anything that is not a Word document
    if not file.endswith(".doc") and not file.endswith(".docx"):
        continue
    newFileCreated = False
    if file.endswith(".doc"):
        file, newFileCreated = save_as_docx(path, file)
    filepath = path + file
    content = docx2python(filepath)
    lines = list(iter_paragraphs(content.document))
    # if a new docx file was created, you want to delete it after getting the contents out
    if newFileCreated:
        os.remove(file)
    '''
    Do the file processing here
    '''
print('hello')  # hi cam

from pathlib import Path
from docx2python import docx2python

# raw string: without the r-prefix, "\007" in this path would be parsed as an
# octal escape (the BEL character) and silently corrupt the path
word_doc = Path(
    r"J:\C04100_C04199\C04147_5_Semple_St_Porirua_Wellington\C04147100_Due_Diligence\007_Work\Reporting\DSI\C04147100R001_FINAL.docx"
)
print(word_doc)
word = docx2python('C04147100R001_FINAL.docx')
text = word.body
exec = text[2]  # note: this name shadows the builtin exec()
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from docx2python import docx2python
import pandas as pd

# read the word file
doc_data = docx2python('Carbonemissiondata.docx')

# the table column headings we will convert to float later
headings = [
    "per capita CO2 (kg per person)",
    "per capita CO (kg per person)",
    "per capita CH4 (kg per person)"
]

# build the dataframe with pandas
df = pd.DataFrame(doc_data.body[0][1:]).applymap(lambda val: val[0].strip("\t"))

# restore the original first row (column headings)
df.columns = [val[0].strip("\t") for val in doc_data.body[0][0]]

# convert the columns read from the word file to float, since docx2python
# returns table data as strings
for i in range(3):
    df[headings[i]] = df[headings[i]].astype(float)

# we will work on columns 2 and 3
X = df.iloc[:, [2, 3]].values
wcss = []
# use the elbow method to find the optimal number of clusters
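# The elbow loop itself is cut off in the snippet. A hedged sketch of the
# standard pattern (the 1-10 cluster range is an assumption):
for k in range(1, 11):
    km = KMeans(n_clusters=k, init='k-means++', random_state=42)
    km.fit(X)
    wcss.append(km.inertia_)  # within-cluster sum of squares
plt.plot(range(1, 11), wcss)
plt.xlabel('number of clusters')
plt.ylabel('WCSS')
plt.show()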
from docx2python import docx2python
from docx import Document
from docx import table
from functions import writeToFile

document1 = docx2python("D:\\PycharmProjects\\docfilefinder\\MASTERRYANLABELFILE.docx")
document2 = docx2python("D:\\PycharmProjects\\docfilefinder\\RYANSTORELABELS.docx")
doc2body = document2.body
body = document1.body
count = 0
doc1list = dict()
for innerbody in body:
    for row in innerbody:
        if row[0][0] == "":
            continue
        doc1list[row[0][0]] = row[0]
        doc1list[row[2][0]] = row[2]
        doc1list[row[4][0]] = row
        count += 1
print("Document 1: {MASTERRYANLABELFILE} List", len(doc1list), "Count : ", count, "Check", count * 3)

# for Document 2
count = 0
doc2list = dict()
for innerbody in doc2body:
    for row in innerbody:
        if row[0][0] == "":
            continue
def loadDocx(doc):
    return docx2python(doc)
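# Hedged usage sketch for loadDocx above (the file name is hypothetical):
result = loadDocx('example.docx')
print(result.text)       # plain-text content
print(len(result.body))  # body nests as table -> row -> cell -> paragraph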
def extract_image(resume):
    """Count the images embedded in a .docx file."""
    result = docx2python(resume)
    # result.images maps image file names to their binary contents
    return len(result.images)
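# A related hedged sketch: the same .images mapping can be written to disk
# (the function name and paths are assumptions, not from the original snippet):
import os
from docx2python import docx2python

def save_images(docx_path, out_dir):
    """Write every embedded image to out_dir under its original name."""
    result = docx2python(docx_path)
    os.makedirs(out_dir, exist_ok=True)
    for name, blob in result.images.items():
        with open(os.path.join(out_dir, name), 'wb') as fh:
            fh.write(blob)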
from docx2python import docx2python
from XmlConverterAPI import Structure, Converter
import re

file = docx2python('Files/ForDocx.docx')
structures = []  # renamed from `list` to avoid shadowing the builtin
regex = re.compile(r"\[|\]|'")
doc = str(file.document).split("',")
for line in doc:
    line = regex.sub('', line)
    # short lines with no period are treated as block headings
    if '.' not in line and line.count(' ') <= 9:
        structures.append(Structure(name=Converter.BLOCK, value=line))
    else:
        structures.append(Structure(name=Converter.TEXT, value=line))
Converter.createXML(structures, "FromDocx_XML.xml")