def flag_private_urls(doc):
    '''Take a Word document filename as input, identify hyperlinks whose URLs
    do not fit the criteria for appearing in a newsletter, and return a table
    with columns 'Link_Text', 'Likely_Private_URL?', and 'URL', sorted by how
    likely each URL is to be a personal link.'''
    # create DOCReader object and process the file
    doc = docxpy.DOCReader(doc)
    doc.process()
    # extract hyperlinks as (text, url) tuples
    hyperlinks = doc.data['links']
    # create DataFrame using the hyperlinks object
    df = pd.DataFrame(hyperlinks, columns=['Link_Text', 'URL'])
    # the link text is byte type; convert to string type
    df['Link_Text'] = df['Link_Text'].str.decode("utf-8")
    # words used to flag the DataFrame; this list will grow once more
    # examples of private URLs are available
    words_to_filter = ["personal", ':p:/t']
    df['Likely_Private_URL?'] = df.iloc[:, 1].str.contains(
        r'\b(?:{})\b'.format('|'.join(words_to_filter)))
    df = df.sort_values(by='Likely_Private_URL?', ascending=False)
    # keep only the flagged rows and the three documented columns
    result = df.loc[df['Likely_Private_URL?'] == True,
                    ['Link_Text', 'Likely_Private_URL?', 'URL']]
    result = result.reset_index()
    return result
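# A minimal usage sketch for flag_private_urls, assuming docxpy and pandas
# (as pd) are imported at module level; 'newsletter_draft.docx' is a
# hypothetical file name, not one from the source:
flagged = flag_private_urls('newsletter_draft.docx')
print(flagged[['Link_Text', 'URL']])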
import docxpy
import PyPDF2
import textract


def spot(filename, listWords):
    extirper = filename.split(".")[-1]
    if extirper == "docx":
        try:
            doc = docxpy.DOCReader(filename)
            doc.process()
            text = doc.data['document'].replace('\n', '')
            with open("CVs/data.txt", "w") as fichier:
                fichier.write(text)
            searchText(listWords)
        except Exception:
            print(f"error reading file {filename}")
    elif extirper == "pdf":
        try:
            pdfFileObj = open(filename, 'rb')
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            num_pages = pdfReader.numPages
            count = 0
            text = ""
            while count < num_pages:
                pageObj = pdfReader.getPage(count)
                count += 1
                text += pageObj.extractText()
            # fall back to OCR when the PDF yields no extractable text
            if text == "":
                text = textract.process(filename, method='tesseract',
                                        language='eng').decode('utf-8')
            with open("CVs/data.txt", "w") as fichier:
                fichier.write(text)
            searchText(listWords)
        except Exception:
            print(f"error reading file {filename}")
    elif extirper == "html":
        with open(filename, "r") as rfichier:
            text = rfichier.read()
        with open("CVs/data.txt", "w") as fichier:
            fichier.write(text)
        searchText(listWords)
    else:
        print("unsupported file extension")
def flag_private_urls_to_dict(filename):
    '''Take a Word document filename as input, identify hyperlinks whose URLs
    do not fit the criteria for appearing in a newsletter, and return a dict
    with keys 'Article_Name', 'Link_Text', 'Likely_Private_URL', and 'URL'
    describing a flagged link (or a 'No Bad Links Detected' placeholder).'''
    # create DOCReader object and process the file
    doc = docxpy.DOCReader(filename)
    doc.process()
    # extract hyperlinks as (text, url) tuples
    hyperlinks = doc.data['links']
    # create DataFrame using the hyperlinks object
    df = pd.DataFrame(hyperlinks, columns=['Link_Text', 'URL'])
    # the link text is byte type; convert to string type
    df['Link_Text'] = df['Link_Text'].str.decode("utf-8")
    # words used to flag the DataFrame; this list will grow once more
    # examples of private URLs are available
    words_to_filter = ["personal", ':p:/t']
    df['Likely_Private_URL?'] = df.iloc[:, 1].str.contains(
        r'\b(?:{})\b'.format('|'.join(words_to_filter)))
    df = df.sort_values(by='Likely_Private_URL?', ascending=False)
    # add the document name as a column for the Excel file in SharePoint
    df['Article_Name'] = f'{filename}'
    df = df[['Article_Name', 'Link_Text', 'Likely_Private_URL?', 'URL']]
    df = df.loc[df['Likely_Private_URL?'] == True]
    if df.shape[0] == 0:
        result = {
            "Article_Name": f'{filename}',
            "Link_Text": 'No Bad Links Detected',
            "Likely_Private_URL": False,
            'URL': 'No Bad Links Detected'
        }
    else:
        # note: if several links are flagged, only the last row survives here
        for i, row in df.iterrows():
            result = {
                "Article_Name": row['Article_Name'],
                "Link_Text": row['Link_Text'],
                "Likely_Private_URL": row['Likely_Private_URL?'],
                'URL': row['URL']
            }
    return result
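# A sketch of how the per-document dicts could feed the Excel report for
# SharePoint mentioned above; the glob pattern, output file name, and use of
# pandas.DataFrame.to_excel are assumptions, not part of the source:
import glob

rows = [flag_private_urls_to_dict(name) for name in glob.glob('*.docx')]
report = pd.DataFrame(rows, columns=['Article_Name', 'Link_Text',
                                     'Likely_Private_URL', 'URL'])
report.to_excel('private_url_report.xlsx', index=False)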
import os
import re

import docxpy
# extract_text is assumed to come from pdfminer.six's high-level API
from pdfminer.high_level import extract_text


def printtext(file):
    file_name, file_extension = os.path.splitext(file)
    junk, filenamenopath = os.path.split(file_name)
    filename = file_name + ".txt"
    if file_extension == ".docx":
        text = docxpy.process(file)
    elif file_extension == ".pdf":
        text = extract_text(file)
    elif file_extension == ".txt":
        with open(file, "r") as f:
            text = f.read()
    else:
        text = "I don't know this file type"
    # strip everything except letters, digits, and whitespace
    text = re.sub(r'[^A-Za-z0-9\s]+', '', text)
    with open("../assignment.txt", "w") as assignmentfile:
        assignmentfile.write(text)
    return text
import docxpy

file = 'my_word_file.docx'

# extract text
text = docxpy.process(file)
print(text)

# extract text and write images to /tmp/img_dir
# text = docxpy.process(file, "/tmp/img_dir")

# if you want the hyperlinks
doc = docxpy.DOCReader(file)
doc.process()  # process file
hyperlinks = doc.data['links']
print(hyperlinks)
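# The 'links' entry is a list of (text, url) tuples with the link text stored
# as bytes; a small follow-up sketch decoding each pair (the variable names
# here are illustrative):
for link_text, url in hyperlinks:
    print(link_text.decode('utf-8'), '->', url)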
import docx
import docxpy

j = 0
k = 1
a = []
b = []
hypertext = ""
left = 0
for i in range(0, 228):
    try:
        # step to the next numbered file on each pass
        j += k
        zee = r'C:\Users\ahmad\Desktop\Projects\wordtocsv\New folder\1 ({}).docx'.format(j)
        print(zee)
        docpx = docxpy.DOCReader(zee)
        docpx.process()  # process file
        hyperlinks = docpx.data['links']
        for z in hyperlinks:
            z1 = z[1]
            z2 = z[0].decode('utf-8')
            hypertext += " {} {} , ".format(z2, z1)
        hyper = hypertext
        doc = docx.Document(zee)
        tables = doc.tables
        for table in tables:
            for row in table.rows:
                for cell in row.cells:
                    d = cell.text
                    a.append(d)
                    # print(cell.text)
    except Exception as e:  # assumed handler; the original excerpt is truncated here
        print(e)
def read_docx_files():
    files = glob.glob(download_path + "\\DocxFiles\\DP*")
    os.chdir(download_path)
    for file in files:
        if os.path.exists(download_path + "\\TestTemplates\\" + os.path.basename(file)):
            os.remove(download_path + "\\TestTemplates\\" + os.path.basename(file))
        docx_handler = docx.Document(file)
        docx_tables = docx_handler.tables
        docx_hyperlink_handler = docxpy.DOCReader(file)
        docx_hyperlink_handler.process()
        hyperlinks = docx_hyperlink_handler.data['links']
        test_scenario_hyperlink_text = str(hyperlinks[0][0])[2:-1]
        jira_id = docx_tables[0].rows[0].cells[0].text
        jira_test_id = jira_id.split()[0]
        jira_test_desc = ""
        for i in range(len(docx_tables)):
            if "Description" in docx_tables[i].rows[0].cells[0].text:
                jira_test_desc = docx_tables[i + 1].rows[0].cells[0].text.strip()
                break
        zephyr_tests = None
        for x in range(len(docx_tables[2].rows)):
            if "Zephyr" in docx_tables[2].rows[x].cells[0].text:
                zephyr_tests = docx_tables[2].rows[x].cells[1]
                break
        if not zephyr_tests:
            # move tests without steps to a separate dir
            if not os.path.exists(download_path + "\\DocxFiles\\TestsWithoutSteps"):
                os.makedirs(download_path + "\\DocxFiles\\TestsWithoutSteps")
            shutil.move(
                download_path + "\\DocxFiles\\" + os.path.basename(file),
                download_path + "\\DocxFiles\\TestsWithoutSteps\\" + os.path.basename(file))
            continue
        zephyr_tests_table = zephyr_tests.tables
        zephyr_rows = zephyr_tests_table[0].rows
        # drop the header row (e.g. "Test Step", "Test Data", etc.)
        zephyr_rows = zephyr_rows[1:]
        # test steps
        list_of_test_steps = []
        for row in zephyr_rows:
            list_of_test_steps.append(row.cells[1].text)
        # test conditions
        list_of_test_conditions = []
        for row in zephyr_rows:
            list_of_test_conditions.append(row.cells[2].text)
        # expected results
        list_of_expected_results = []
        for row in zephyr_rows:
            list_of_expected_results.append(row.cells[3].text)
        number_of_teststeps = zephyr_tests_table[0].rows[-1].cells[0].text
        file_save_path = download_path + "\\TestTemplates\\" + os.path.basename(file)
        final_docx_template = docx.Document(str(script_path) + "\\SampleTestScripts1.docx")
        final_docx_table = final_docx_template.tables
        font = final_docx_template.styles['Normal'].font
        font.name = 'Calibri'
        paragraph = final_docx_template.styles['Normal'].paragraph_format
        paragraph.space_after = Pt(3)
        paragraph.left_indent = Pt(0)
        heading1 = final_docx_template.styles['Heading 1'].paragraph_format
        heading1.space_before = Pt(0)
        for x in range(1, int(number_of_teststeps)):
            final_docx_table[0].add_row()
        os.chdir(download_path)
        test_id = final_docx_table[0].rows[0].cells[2].paragraphs[0]
        test_id.add_run(jira_test_id)
        test_scenario = final_docx_table[0].rows[1].cells[2].paragraphs[0]
        test_scenario.add_run(test_scenario_hyperlink_text)
        final_docx_table[0].rows[1].cells[2].paragraphs[0].paragraph_format.left_indent = Pt(0)
        test_description = final_docx_table[0].rows[2].cells[2].paragraphs[0]
        test_description.add_run(jira_test_desc)
        steps_only_table = final_docx_table[0].rows[4:]
        for x in range(0, int(number_of_teststeps)):
            steps_only_table[x].cells[0].paragraphs[0].paragraph_format.left_indent = Pt(12)
        for x in range(0, int(number_of_teststeps)):
            steps_only_table[x].cells[0].paragraphs[0].add_run(str(x + 1) + ".")
        for x in range(0, int(number_of_teststeps)):
            steps_only_table[x].cells[1].paragraphs[0].add_run(list_of_test_steps[x])
        for x in range(0, int(number_of_teststeps)):
            steps_only_table[x].cells[2].paragraphs[0].add_run(list_of_test_conditions[x])
        for x in range(0, int(number_of_teststeps)):
            steps_only_table[x].cells[3].paragraphs[0].add_run(list_of_expected_results[x])
        final_docx_template.save(file_save_path)
import docxpy

doc = docxpy.DOCReader('Demo_WORD.docx')
doc.process()
# doc.data is a dict with keys ['header', 'document', 'links', 'footer']
linkextracts = doc.data['links']

# linkextracts is a list of (text, url) tuples; print each one
for i in linkextracts:
    print(i)

'''
Alternate method: iterating a 2D list (in this case, a list of tuples)
links = []
for i, j in linkextracts:
    links.append(j)
print(len(links))
# print out the list called links
for k in links:
    print(k)
'''
import docxpy
import PyPDF2
import textract


def spot(filename, listWords):
    """Return the number of occurrences of the searched words;
    listWords is passed through to the searchText function."""
    # extirper splits the file name on '.' and keeps only the extension
    extirper = filename.split(".")[-1]
    # if branch to extract data from a .docx file
    if extirper == "docx":
        try:
            # docxpy's DOCReader extracts the data as text
            doc = docxpy.DOCReader(filename)
            doc.process()
            text = doc.data['document'].replace('\n', '')
            # the data is written to CVs/data.txt, then reused below
            with open("CVs/data.txt", "w") as fichier:
                fichier.write(text)
            returnedSearch = searchText(listWords, filename)
            return returnedSearch
        except Exception:
            print(f"error reading file {filename}")
            return []
    # elif branch to extract data from a PDF file
    elif extirper == "pdf":
        try:
            pdfFileObj = open(filename, 'rb')
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            # knowing the number of pages lets us parse every page
            num_pages = pdfReader.numPages
            count = 0
            text = ""
            # the while loop reads each page
            while count < num_pages:
                pageObj = pdfReader.getPage(count)
                count += 1
                text += pageObj.extractText()
            # fall back to OCR when no text could be extracted
            if text == "":
                text = textract.process(filename, method='tesseract',
                                        language='eng').decode('utf-8')
            with open("CVs/data.txt", "w") as fichier:
                fichier.write(text)
            returnedSearch = searchText(listWords, filename)
            return returnedSearch
        except Exception:
            print(f"error reading file {filename}")
            return []
    # elif branch to extract data from an HTML file
    elif extirper == "html":
        with open(filename, "r") as rfichier:
            text = rfichier.read()
        with open("CVs/data.txt", "w") as fichier:
            fichier.write(text)
        returnedSearch = searchText(listWords, filename)
        return returnedSearch
    else:
        print("unsupported file extension")
        return []
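# searchText is called above but not defined in this snippet. A minimal
# sketch of what it might look like, assuming it reads the CVs/data.txt file
# written by spot and counts case-insensitive occurrences; the (word, count)
# return format is an assumption:
def searchText(listWords, filename):
    with open("CVs/data.txt", "r") as fichier:
        text = fichier.read().lower()
    # one (word, count) pair per searched word found in the extracted text
    return [(word, text.count(word.lower())) for word in listWords]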
# %%
import docxpy
import docx2txt
import unicodedata

# %%
text = docxpy.process('data/article one multiple topics.docx')

# %%
doc = docxpy.DOCReader('data/articletwo.docx')

# %%
doc.process()
text = doc.data['document']  # process() itself returns nothing

# %%
text = docx2txt.process('data/articletwo.docx')

# %%
from textsummarization import *

# %%
methods = ['bert_sum']
for i in methods:
    print(i)
    print(extract_sum(text, 0.5, i))

# %%
import torch
import json