Exemplo n.º 1
0
def flag_private_urls(doc):
    '''Return the hyperlinks of a Word document that look like private links.

    The document is parsed with docxpy and every hyperlink whose URL contains
    one of the private-link keywords is kept; all other links are dropped.

    Args:
        doc: Path of the .docx file to inspect (handed to docxpy.DOCReader).

    Returns:
        A pandas DataFrame containing only the flagged links, with the
        columns 'index' (position of the link in the full list of document
        links), 'Link_Text', 'Likely_Private_URL?' and 'URL'. The rows keep
        the order in which the links were extracted.
    '''
    # Function-scope import so the block does not depend on a file-level one.
    import re

    def _keyword_pattern(word):
        # A "\b" only makes sense next to a word character. Wrapping a
        # keyword such as ':p:/t' in "\b...\b" would require a word character
        # in front of the colon, so '/:p:/t/' (the usual SharePoint form)
        # could never match.
        prefix = r'\b' if re.match(r'\w', word[:1]) else ''
        suffix = r'\b' if re.match(r'\w', word[-1:]) else ''
        return prefix + re.escape(word) + suffix

    reader = docxpy.DOCReader(doc)
    reader.process()
    hyperlinks = reader.data['links']

    df = pd.DataFrame(hyperlinks, columns=['Link_Text', 'URL'])
    # The link text comes back as bytes; convert it to str.
    df['Link_Text'] = df['Link_Text'].str.decode("utf-8")

    # Keywords that mark a URL as private. This list is expected to grow as
    # more examples of private URLs are collected.
    words_to_filter = ["personal", ':p:/t']
    pattern = '|'.join(_keyword_pattern(word) for word in words_to_filter)
    # na=False: a missing URL is treated as "not private" instead of NaN.
    df['Likely_Private_URL?'] = df['URL'].str.contains(pattern, na=False)

    # Keep only flagged rows and apply the documented column order. Sorting
    # by the flag is unnecessary because every remaining row is True.
    result = df.loc[df['Likely_Private_URL?'],
                    ['Link_Text', 'Likely_Private_URL?', 'URL']]
    # reset_index() keeps the original row position in an 'index' column.
    return result.reset_index()
Exemplo n.º 2
0
def spot(filename, listWords):
    """Dump the text of a document into CVs/data.txt and run the word search.

    The file type is taken from the text after the last "." in *filename*.
    Supported types are "docx", "pdf" and "html"; anything else only prints
    a message. After the text has been written to "CVs/data.txt",
    ``searchText(listWords)`` is called (presumably it reads that file --
    confirm against its definition).

    Args:
        filename: Path of the document to read.
        listWords: Words to search for; passed unchanged to ``searchText``.

    Returns:
        None. Read/convert errors for docx and pdf files are reported with a
        printed message instead of being raised.
    """
    extension = filename.split(".")[-1]

    if extension == "docx":
        try:
            doc = docxpy.DOCReader(filename)
            doc.process()
            text = doc.data['document'].replace('\n', '')

            with open("CVs/data.txt", "w") as fichier:
                fichier.write(text)
            searchText(listWords)
        except Exception:
            # Best effort: report the file and carry on with the next one.
            print(f"erreur de lecture du fichier {filename}")

    elif extension == "pdf":
        try:
            # The reader pulls pages lazily, so the file must stay open
            # until every page has been extracted.
            with open(filename, 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfFileReader(pdf_file)
                text = "".join(
                    pdf_reader.getPage(page_number).extractText()
                    for page_number in range(pdf_reader.numPages))

            if not text:
                # No embedded text (e.g. a scanned PDF): fall back to OCR.
                text = textract.process(filename, method='tesseract',
                                        language='eng')
                # textract returns bytes; the output file is in text mode.
                if isinstance(text, bytes):
                    text = text.decode("utf-8")

            with open("CVs/data.txt", "w") as fichier:
                fichier.write(text)
            searchText(listWords)
        except Exception:
            print(f"erreur de lecture du fichier {filename}")

    elif extension == "html":
        # No error handling here: I/O errors propagate to the caller.
        with open(filename, "r") as rfichier:
            text = rfichier.read()
        with open("CVs/data.txt", "w") as fichier:
            fichier.write(text)
        searchText(listWords)
    else:
        print("extention de fichier non gérée")
Exemplo n.º 3
0
def flag_private_urls_to_dict(filename):
    '''Report a likely-private hyperlink of a Word document as a dict.

    The document is parsed with docxpy and every hyperlink whose URL contains
    one of the private-link keywords is flagged.

    Args:
        filename: Path of the .docx file to inspect; also used as the
            'Article_Name' value of the result.

    Returns:
        A dict with the keys 'Article_Name', 'Link_Text',
        'Likely_Private_URL' and 'URL'. When no link is flagged, the text
        fields hold 'No Bad Links Detected' and 'Likely_Private_URL' is
        False. When links are flagged, the dict describes the LAST flagged
        link only.

    NOTE(review): returning only the last flagged link matches the previous
    behaviour (each row overwrote the result); confirm with the caller
    whether one dict per flagged link is actually wanted.
    '''
    # Function-scope import so the block does not depend on a file-level one.
    import re

    def _keyword_pattern(word):
        # A "\b" only makes sense next to a word character. Wrapping a
        # keyword such as ':p:/t' in "\b...\b" would require a word character
        # in front of the colon, so '/:p:/t/' (the usual SharePoint form)
        # could never match.
        prefix = r'\b' if re.match(r'\w', word[:1]) else ''
        suffix = r'\b' if re.match(r'\w', word[-1:]) else ''
        return prefix + re.escape(word) + suffix

    reader = docxpy.DOCReader(filename)
    reader.process()
    hyperlinks = reader.data['links']

    df = pd.DataFrame(hyperlinks, columns=['Link_Text', 'URL'])
    # The link text comes back as bytes; convert it to str.
    df['Link_Text'] = df['Link_Text'].str.decode("utf-8")

    # Keywords that mark a URL as private. This list is expected to grow as
    # more examples of private URLs are collected.
    words_to_filter = ["personal", ':p:/t']
    pattern = '|'.join(_keyword_pattern(word) for word in words_to_filter)
    # na=False: a missing URL is treated as "not private" instead of NaN.
    df['Likely_Private_URL?'] = df['URL'].str.contains(pattern, na=False)

    # Name of the document, used as a column of the Excel file in SharePoint.
    df['Article_Name'] = f'{filename}'
    flagged = df.loc[df['Likely_Private_URL?'],
                     ['Article_Name', 'Link_Text', 'Likely_Private_URL?',
                      'URL']]

    if flagged.empty:
        return {
            "Article_Name": f'{filename}',
            "Link_Text": 'No Bad Links Detected',
            "Likely_Private_URL": False,
            'URL': 'No Bad Links Detected'
        }

    # Only one dict can be returned, so report the last flagged link.
    row = flagged.iloc[-1]
    return {
        "Article_Name": row['Article_Name'],
        "Link_Text": row['Link_Text'],
        "Likely_Private_URL": row['Likely_Private_URL?'],
        'URL': row['URL']
    }
Exemplo n.º 4
0
def printtext(file):
    """Extract the text of *file*, strip punctuation and save the result.

    The extractor is chosen from the file extension: ".docx" uses
    ``docxpy.process``, ".pdf" uses ``extract_text`` (defined elsewhere) and
    ".txt" is read directly. Any other extension yields a fixed placeholder
    sentence. Every character that is not an ASCII letter, a digit or
    whitespace is then removed, and the cleaned text is written to
    "../assignment.txt" (relative to the current working directory).

    Args:
        file: Path of the document to convert.

    Returns:
        The cleaned text that was written to "../assignment.txt".
    """
    _, file_extension = os.path.splitext(file)

    if file_extension == ".docx":
        text = docxpy.process(file)
    elif file_extension == ".pdf":
        text = extract_text(file)
    elif file_extension == ".txt":
        with open(file, "r") as source:
            text = source.read()
    else:
        text = "I don't know this file type"

    # Raw string so "\s" reaches the regex engine as a whitespace class.
    text = re.sub(r'[^A-Za-z0-9\s]+', '', text)

    # The context manager guarantees the output is flushed and closed.
    with open("../assignment.txt", "w") as assignmentfile:
        assignmentfile.write(text)
    return text
Exemplo n.º 5
0
import docxpy

# Minimal docxpy walkthrough: print the text of a .docx file, then print the
# hyperlinks it contains.
file = 'my_word_file.docx'

# extract text
text = docxpy.process(file)
print(text)

# extract text and write images in /tmp/img_dir
#text = docxpy.process(file, "/tmp/img_dir")

# if you want the hyperlinks
# The reader has to process() the file before doc.data is read below.
doc = docxpy.DOCReader(file)
doc.process()  # process file
# Other snippets treat each entry as a (link text as bytes, URL) pair --
# presumably the same here; confirm against the docxpy documentation.
hyperlinks = doc.data['links']
print(hyperlinks)
Exemplo n.º 6
0

# Batch extraction of hyperlinks and table-cell text from numbered .docx
# files. NOTE(review): this excerpt ends inside the `try:` block below; no
# `except`/`finally` clause is visible here.

j = 0
k = 1
j += k  # j is 1 from here on; it is the number inserted into the file name
a = []  # receives the text of every table cell visited
b = []  # not used in the visible part of the script
hypertext = ""  # accumulates "<link text> <url> , " for every hyperlink
left = 0  # not used in the visible part of the script

# NOTE(review): j is not changed inside the visible part of the loop, so
# every visible iteration builds the same path -- confirm it is incremented
# further down. hypertext is not reset per file either, so it keeps growing
# across iterations.
for i in range(0, 228):
    try:
        zee = r'C:\Users\ahmad\Desktop\Projects\wordtocsv\New folder\1 ({}).docx'.format(
            j)
        print(zee)
        # docxpy is used for the hyperlinks only.
        docpx = docxpy.DOCReader(zee)
        docpx.process()  # process file
        hyperlinks = docpx.data['links']
        for z in hyperlinks:
            # Each entry is indexed as (link text in bytes, url).
            z1 = z[1]
            z2 = z[0].decode('utf-8')
            hypertext += " {} {} , ".format(z2, z1)
        hyper = hypertext
        # The same file is opened a second time with docx.Document to walk
        # its tables.
        doc = docx.Document(zee)
        tables = doc.tables
        for table in tables:
            for row in table.rows:
                for cell in row.cells:
                    d = cell.text
                    a.append(d)
                    #                     print (cell.text)
Exemplo n.º 7
0
def read_docx_files():
    """Convert every downloaded "DP*" test document into a filled template.

    For each file matching ``<download_path>/DocxFiles/DP*`` the function
    reads the Jira test id, the text of the first hyperlink (used as the
    test scenario), the description and the Zephyr test-step table, then
    fills a copy of ``<script_path>/SampleTestScripts1.docx`` and saves it
    as ``<download_path>/TestTemplates/<same file name>``. An existing
    output file of that name is deleted first.

    Documents in which no "Zephyr" row is found in the third table are moved
    to ``<download_path>/DocxFiles/TestsWithoutSteps`` and skipped.

    Relies on the module-level names ``download_path`` and ``script_path``
    and changes the current working directory to ``download_path``.

    Returns:
        None.
    """
    docx_dir = os.path.join(download_path, "DocxFiles")
    templates_dir = os.path.join(download_path, "TestTemplates")
    no_steps_dir = os.path.join(docx_dir, "TestsWithoutSteps")

    files = glob.glob(os.path.join(docx_dir, "DP*"))
    os.chdir(download_path)
    for file in files:
        base_name = os.path.basename(file)
        file_save_path = os.path.join(templates_dir, base_name)

        # Drop any output left over from a previous run.
        if os.path.exists(file_save_path):
            os.remove(file_save_path)

        docx_tables = docx.Document(file).tables

        docx_hyperlink_handler = docxpy.DOCReader(file)
        docx_hyperlink_handler.process()
        hyperlinks = docx_hyperlink_handler.data['links']

        # The link text arrives as bytes. Decode it properly: slicing the
        # repr of the bytes object garbles non-ASCII or escaped characters.
        test_scenario_hyperlink_text = hyperlinks[0][0]
        if isinstance(test_scenario_hyperlink_text, bytes):
            test_scenario_hyperlink_text = test_scenario_hyperlink_text.decode(
                "utf-8", errors="replace")
        else:
            test_scenario_hyperlink_text = str(test_scenario_hyperlink_text)

        jira_id = docx_tables[0].rows[0].cells[0].text
        jira_test_id = jira_id.split()[0]

        # The description text lives in the table that FOLLOWS the table
        # whose first cell contains "Description".
        jira_test_desc = ""
        for i, table in enumerate(docx_tables):
            if "Description" in table.rows[0].cells[0].text:
                jira_test_desc = docx_tables[i + 1].rows[0].cells[0].text.strip()
                break

        # The Zephyr steps are a nested table in the second cell of the
        # "Zephyr" row of the third table.
        zephyr_tests = None
        for row in docx_tables[2].rows:
            if "Zephyr" in row.cells[0].text:
                zephyr_tests = row.cells[1]
                break
        if zephyr_tests is None:
            # Move test without steps to separate dir
            os.makedirs(no_steps_dir, exist_ok=True)
            shutil.move(os.path.join(docx_dir, base_name),
                        os.path.join(no_steps_dir, base_name))
            continue

        zephyr_rows = zephyr_tests.tables[0].rows
        # The step count is the number shown in the first cell of the last
        # row of the steps table.
        number_of_teststeps = int(zephyr_rows[-1].cells[0].text)

        # Skip the header row ("Test Step", "Test Data", ...) and collect
        # the three text columns in a single pass over the rows.
        list_of_test_steps = []
        list_of_test_conditions = []
        list_of_expected_results = []
        for row in zephyr_rows[1:]:
            cells = row.cells
            list_of_test_steps.append(cells[1].text)
            list_of_test_conditions.append(cells[2].text)
            list_of_expected_results.append(cells[3].text)

        final_docx_template = docx.Document(
            os.path.join(str(script_path), "SampleTestScripts1.docx"))
        template_table = final_docx_template.tables[0]

        normal_style = final_docx_template.styles['Normal']
        normal_style.font.name = 'Calibri'
        normal_style.paragraph_format.space_after = Pt(3)
        normal_style.paragraph_format.left_indent = Pt(0)
        heading1 = final_docx_template.styles['Heading 1'].paragraph_format
        heading1.space_before = Pt(0)

        # One row is added per test step after the first one.
        for _ in range(1, number_of_teststeps):
            template_table.add_row()

        template_rows = template_table.rows
        test_id = template_rows[0].cells[2].paragraphs[0]
        test_id.add_run(jira_test_id)
        test_scenario = template_rows[1].cells[2].paragraphs[0]
        test_scenario.add_run(test_scenario_hyperlink_text)
        test_scenario.paragraph_format.left_indent = Pt(0)
        test_description = template_rows[2].cells[2].paragraphs[0]
        test_description.add_run(jira_test_desc)

        # Step rows start at the fifth row of the template table.
        steps_only_table = template_rows[4:]
        for x in range(number_of_teststeps):
            cells = steps_only_table[x].cells
            step_number = cells[0].paragraphs[0]
            step_number.paragraph_format.left_indent = Pt(12)
            step_number.add_run(str(x + 1) + ".")
            cells[1].paragraphs[0].add_run(list_of_test_steps[x])
            cells[2].paragraphs[0].add_run(list_of_test_conditions[x])
            cells[3].paragraphs[0].add_run(list_of_expected_results[x])

        final_docx_template.save(file_save_path)
Exemplo n.º 8
0
import docxpy
import os

# Parse the demo document and print every value stored in doc.links.
doc = docxpy.DOCReader('Demo_WORD.docx')
doc.process()
# Only the commented-out "Alternate-method" in the string below uses this.
linkextracts = doc.data['links']

#doc.data.keys() --> Since doc.data is of type dict
#['header', 'document', 'links', 'footer']
# NOTE(review): doc.links is a different attribute from doc.data['links'];
# it is used here as a mapping -- presumably its values are the link
# targets; confirm against the docxpy source.
for i in doc.links.values():
    print(i)
'''
Alternate-method
#iterating a 2D list. In this case, it's a list of tuples
links = []
for i,j in linkextracts:
    links.append(j)

print(len(links))

#printing out the list called links
for k in links:
    print(k)
'''
Exemplo n.º 9
0
def spot(filename, listWords):
    """Dump a document's text into CVs/data.txt and return the search result.

    The file type is taken from the text after the last "." in *filename*.
    Supported types are "docx", "pdf" and "html". After the text has been
    written to "CVs/data.txt", the request is forwarded to
    ``searchText(listWords, filename)`` (presumably it reads that file --
    confirm against its definition).

    Args:
        filename: Path of the document to read.
        listWords: Words to search for; passed unchanged to ``searchText``.

    Returns:
        Whatever ``searchText`` returns, or an empty list when the extension
        is not supported or a docx/pdf file could not be processed.
    """
    # Keep only the extension, i.e. the part after the last ".".
    extension = filename.split(".")[-1]

    # Word documents: docxpy's DOCReader extracts the text.
    if extension == "docx":
        try:
            doc = docxpy.DOCReader(filename)
            doc.process()
            text = doc.data['document'].replace('\n', '')
            # The text is stored in CVs/data.txt for the search step.
            with open("CVs/data.txt", "w") as fichier:
                fichier.write(text)
            return searchText(listWords, filename)
        except Exception:
            # Best effort: report the file and let the caller continue.
            print(f"erreur de lecture du fichier {filename}")
            return []

    # PDF documents: read every page with PyPDF2.
    elif extension == "pdf":
        try:
            # The reader pulls pages lazily, so the file must stay open
            # until every page has been extracted.
            with open(filename, 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfFileReader(pdf_file)
                text = "".join(
                    pdf_reader.getPage(page_number).extractText()
                    for page_number in range(pdf_reader.numPages))

            if not text:
                # No embedded text (e.g. a scanned PDF): fall back to OCR.
                text = textract.process(filename, method='tesseract',
                                        language='eng')
                # textract returns bytes; the output file is in text mode.
                if isinstance(text, bytes):
                    text = text.decode("utf-8")

            with open("CVs/data.txt", "w") as fichier:
                fichier.write(text)
            return searchText(listWords, filename)
        except Exception:
            print(f"erreur de lecture du fichier {filename}")
            return []

    # HTML documents: the raw file content is used as is. No error handling
    # here: I/O errors propagate to the caller.
    elif extension == "html":
        with open(filename, "r") as rfichier:
            text = rfichier.read()
        with open("CVs/data.txt", "w") as fichier:
            fichier.write(text)
        return searchText(listWords, filename)
    else:
        print("extention de fichier non gérée")
        return []
Exemplo n.º 10
0
#%%
import docxpy
import docx2txt
import unicodedata

# %%
# Extract the text of the first article with docxpy.
text = docxpy.process('data/article one multiple topics.docx')

# %%
# Build a reader for the second article (processed in the next cell).
doc = docxpy.DOCReader('data/articletwo.docx')

# %%
# NOTE(review): `text` is rebound to whatever DOCReader.process() returns;
# its return value is not shown here -- confirm it is the document text.
text = doc.process()

# %%
import docx2txt


# %%
# Overwrites `text` again, this time with docx2txt's extraction of the
# second article; this is the value used by the summarization cell below.
text = docx2txt.process('data/articletwo.docx')

# %%
from textsummarization import *
# %%
# Run each listed summarization method on `text` and print its output.
# extract_sum is presumably provided by the star import above; the meaning
# of the 0.5 argument is not visible here -- TODO confirm.
methods = ['bert_sum']
for i in methods:
    print(i)
    print(extract_sum(text, 0.5, i))
# %%
import torch
import json