Example #1
def mine_text(pdf_path):
    # Imports needed by this snippet
    import sys
    from pdfminer.high_level import extract_text_to_fp
    from pdfminer.layout import LAParams
    from bs4 import BeautifulSoup

    if sys.version_info > (3, 0):
        from io import StringIO
    else:
        from io import BytesIO as StringIO

    output_string = StringIO()
    with open(pdf_path, 'rb') as fin:
        extract_text_to_fp(fin,
                           output_string,
                           laparams=LAParams(),
                           output_type='html',
                           codec=None)
    str_html = output_string.getvalue().strip()
    with open('temp.html', 'w') as fh:
        fh.write(str_html)

    # Parse the HTML content
    soup = BeautifulSoup(str_html, "lxml")
    return soup
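A quick way to exercise mine_text; the sample path is a placeholder, and bs4 plus lxml must be installed. Each div in the generated HTML roughly corresponds to one pdfminer layout box:

soup = mine_text('samples/simple1.pdf')  # placeholder path
for div in soup.find_all('div'):
    print(div.get_text(strip=True))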
Example #2
    def _parse_calendar(self, response):
        """Parse dates and details from schedule PDF"""
        lp = LAParams(line_margin=0.1)
        out_str = StringIO()
        extract_text_to_fp(BytesIO(response.body), out_str, laparams=lp)
        pdf_text = re.sub(r"\s+", " ", out_str.getvalue()).replace(" ,", ",")

        for idx, date_str in enumerate(
                re.findall(r"[a-zA-Z]{3,10} \d{1,2}, \d{4}", pdf_text)):
            # Ignore every other item
            if idx % 2 == 1:
                continue
            meeting = Meeting(
                title="Urban Design and Historic Preservation Commission",
                description="",
                classification=COMMISSION,
                start=self._parse_start(date_str),
                end=None,
                all_day=False,
                time_notes="Confirm details with agency",
                location=self.location,
                links=[],
                source=self.start_urls[0],
            )

            meeting["status"] = self._get_status(meeting)
            meeting["id"] = self._get_id(meeting)

            yield meeting
Example #3
    def _parse_calendar(self, response):
        lp = LAParams(line_margin=5.0)
        out_str = StringIO()
        extract_text_to_fp(BytesIO(response.body), out_str, laparams=lp)
        pdf_text = out_str.getvalue()
        split_dates = re.split(r"([A-Z][a-z]{2,8}\s+\d{1,2}, \d{4}[ \n$])", pdf_text, flags=re.M)
        date_groups = [split_dates[1]]
        for split_str in split_dates[2:]:
            if re.search(r"([A-Z][a-z]{2,8}\s+\d{1,2}, \d{4}[ \n$])", split_str):
                date_groups.append(split_str)
            else:
                date_groups[-1] = date_groups[-1] + split_str

        for date_item_str in date_groups:
            item = re.sub(r" +", " ", date_item_str).strip()
            start = self._parse_start(item)
            if not start:
                continue
            meeting = Meeting(
                title="Board of Trustees",
                description="",
                classification=BOARD,
                start=start,
                end=None,
                all_day=False,
                time_notes="",
                location=self._parse_location(item),
                links=self.agenda_map.get(start.date(), []),
                source=response.url,
            )

            meeting["status"] = self._get_status(meeting, text=item)
            meeting["id"] = self._get_id(meeting)

            yield meeting
Example #4
def check_first_page_is_cover(pdf: bytes) -> bool:
    """Reads a PDF and returns True if its first page looks like a cover page"""
    with io.StringIO() as test_string:
        params = layout.LAParams(line_margin=2)
        # extract_text_to_fp expects a file-like object, so wrap the raw bytes
        extract_text_to_fp(io.BytesIO(pdf), test_string, page_numbers=[0], laparams=params)
        first_page = test_string.getvalue()
        # Treat a sparse first page (100 words or fewer) as a cover
        return len(first_page.split()) <= 100
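A minimal call sketch; 'report.pdf' is a placeholder name, and the raw bytes are passed directly because the function wraps them in BytesIO itself:

with open('report.pdf', 'rb') as f:
    if check_first_page_is_cover(f.read()):
        print('First page looks like a cover page')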
Example #5
File: text.py Project: kmacprt/pdfbot
def get_pdf_text(update, context, is_file):
    if not check_user_data(update, context, PDF_INFO):
        return ConversationHandler.END

    _ = set_lang(update, context)
    update.effective_message.reply_text(
        _("Extracting text from your PDF file"), reply_markup=ReplyKeyboardRemove()
    )

    with tempfile.NamedTemporaryFile() as tf:
        user_data = context.user_data
        file_id, file_name = user_data[PDF_INFO]
        pdf_file = context.bot.get_file(file_id)
        pdf_file.download(custom_path=tf.name)

        with tempfile.TemporaryDirectory() as dir_name:
            tmp_text = tempfile.TemporaryFile()
            with open(tf.name, "rb") as f:
                extract_text_to_fp(f, tmp_text)

            tmp_text.seek(0)
            pdf_texts = textwrap.wrap(tmp_text.read().decode("utf-8").strip())
            out_fn = os.path.join(dir_name, f"{os.path.splitext(file_name)[0]}.txt")
            send_pdf_text(update, context, pdf_texts, is_file, out_fn)

    # Clean up memory
    if user_data[PDF_INFO] == file_id:
        del user_data[PDF_INFO]

    return ConversationHandler.END
Example #6
    def _parse_notice(self, response):
        """
        Parse meeting from notice text if embedded text, otherwise use text in meta
        """
        lp = LAParams(line_margin=0.1)
        out_str = StringIO()
        extract_text_to_fp(BytesIO(response.body), out_str, laparams=lp)
        pdf_text = out_str.getvalue()
        if not pdf_text.strip():
            yield self._parse_meeting_text(response.meta["meeting_text"],
                                           response.url)
        else:
            date_match = re.search(r"[A-Z][a-z]{2,8} \d{1,2},? \d{4}",
                                   response.meta["meeting_text"])
            if date_match:
                date_obj = datetime.strptime(
                    date_match.group().replace(",", ""), "%B %d %Y").date()
                if "Notice" not in [
                        link["title"] for link in self.link_date_map[date_obj]
                ]:
                    self.link_date_map[date_obj].append({
                        "title": "Notice",
                        "href": response.url
                    })
            yield self._parse_meeting_text(re.sub(r"\s+", " ", pdf_text),
                                           response.meta["source"])
Example #7
def pdf_to_text(html=False):
    # Returns a dictionary where the key is the file name
    # and the value is the text content of the PDF
    pdfs = get_pdfs()
    rename_files()
    all_tasks = {}
    for pdf in pdfs:
        # str.strip removes any of the characters '.', 'p', 'd', 'f' from both
        # ends, so drop the ".pdf" suffix explicitly instead
        if pdf.endswith(".pdf"):
            pdf = pdf[:-len(".pdf")]
        if html:
            output_string = StringIO()
            with open(Path(PDF_FOLDER) / Path(pdf + ".pdf"), "rb") as fin:
                extract_text_to_fp(
                    fin,
                    output_string,
                    output_type="html",
                    codec=None,
                )
                all_tasks[pdf] = output_string.getvalue()

        else:
            text = extract_text(Path(PDF_FOLDER) / Path(pdf + ".pdf"))
            # Replace newlines with HTML line breaks:
            text = text.replace("\n", "<br />\n").strip()
            all_tasks[pdf] = text
    return all_tasks
Example #8
def convert_pdf_to_text(path):
    output = StringIO()

    with open(path, "rb") as f:
        extract_text_to_fp(f, output)

    return output.getvalue()
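The excerpt assumes StringIO and extract_text_to_fp are already imported; a self-contained call might look like this, with 'sample.pdf' as a placeholder:

from io import StringIO
from pdfminer.high_level import extract_text_to_fp

print(convert_pdf_to_text('sample.pdf'))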
Example #9
def compare(file1, file2, **kwargs):
    # If no LAParams object was passed, create one and populate it
    # with any individual layout arguments from kwargs.
    if kwargs.get('laparams', None) is None:
        laparams = layout.LAParams()
        for param in ("all_texts", "detect_vertical", "word_margin",
                      "char_margin", "line_margin", "boxes_flow"):
            paramv = kwargs.get(param, None)
            if paramv is not None:
                # LAParams doesn't support item assignment; use setattr
                setattr(laparams, param, paramv)
        kwargs['laparams'] = laparams

    s1 = io.StringIO()
    with open(file1, "rb") as fp:
        high_level.extract_text_to_fp(fp, s1, **kwargs)

    s2 = io.StringIO()
    with open(file2, "rb") as fp:
        high_level.extract_text_to_fp(fp, s2, **kwargs)

    import difflib
    s1.seek(0)
    s2.seek(0)
    s1, s2 = s1.readlines(), s2.readlines()

    import os.path
    try:
        extension = os.path.splitext(kwargs['outfile'])[1][1:4]
        if extension.lower() == 'htm':
            return difflib.HtmlDiff().make_file(s1, s2)
    except KeyError:
        pass
    return difflib.unified_diff(s1, s2, n=kwargs['context_lines'])
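A hedged usage sketch: compare reads kwargs['context_lines'] when it falls back to a unified diff, and recent pdfminer.six releases tolerate the extra keyword because extract_text_to_fp accepts **kwargs. The file names are placeholders:

import sys

diff = compare('a.pdf', 'b.pdf', context_lines=3)
if isinstance(diff, str):  # HTML diff (outfile ended in .htm)
    print(diff)
else:
    sys.stdout.writelines(diff)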
Example #10
def get_decision_citation_item(source: PDPCDecisionItem, options: Options = None) -> (str, str):
    """
    Gets the citation and case number for a PDPCDecisionItem.

    :param options:
    :param source: The PDPCDecisionItem to get the citation and case number.
    :return: A tuple consisting of (citation, case_number)
    """
    from pdfminer.high_level import extract_text_to_fp
    import io
    import re
    citation = ''
    case_number = ''
    if check_pdf(source.download_url):
        with PDFFile(source, options) as pdf, io.StringIO() as output_string:
            extract_text_to_fp(pdf, output_string, page_numbers=[0, 1])
            contents = output_string.getvalue()
        summary_match = re.search(r'SUMMARY OF THE DECISION', contents)
        if not summary_match:
            citation_match = re.search(r'(\[\d{4}])\s+((?:\d\s+)?[A-Z|()]+)\s+\[?(\d+)\]?', contents)
            if citation_match:
                citation = citation_match.expand(r'\1 \2 \3')
            else:
                logger.warning(f'No citation found for {source}')
        else:
            logger.info(f'Decision <{source}> is a summary and does not have a citation.')
        case_match = re.search(r'DP-\s*(\w*)-\s*(\w*)', contents)
        if case_match:
            case_number = case_match.expand(r'DP-\1-\2')
        else:
            logger.warning(f'No case number found for {source}')
    return citation, case_number
Example #11
def get_contents_with_attributes(path):
    output_io = io.StringIO()
    with open(path, 'rb') as input:
        extract_text_to_fp(input,
                           output_io,
                           laparams=LAParams(line_margin=0.21,
                                             line_overlap=0.4,
                                             all_texts=False),
                           output_type='html',
                           codec=None)
    html = BeautifulSoup(output_io.getvalue(), 'html.parser')
    final_content = []
    for div in html.find_all("div"):
        temp_div = []
        for span in div.find_all("span"):
            if 'bold' in span['style'].lower():
                if span.text.strip():
                    temp_div.append(f'<b>{span.text.strip()}</b>')
            if 'bold' not in span['style'].lower():
                if span.text.strip():
                    temp_div.append(span.text.strip())
        if temp_div:
            final_content.append(" ".join(temp_div))
    output_io.close()
    return final_content
Example #12
def convert_file(filepath: Path) -> None:
    output_filepath = filepath.with_suffix(".txt")
    with filepath.open("rb") as pdf_file:
        output_string = StringIO()
        extract_text_to_fp(pdf_file, outfp=output_string, laparams=LAParams(), output_type="text")
        with output_filepath.open("w") as txt_file:
            txt_file.write(output_string.getvalue().strip())
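convert_file writes the extracted text next to the source file with a .txt suffix; a sketch that batch-converts a folder, assuming the imports the excerpt relies on (StringIO, extract_text_to_fp, LAParams) and a placeholder directory name:

from pathlib import Path

for pdf_path in Path('docs').glob('*.pdf'):
    convert_file(pdf_path)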
Example #13
    def attribute_checking(self, input_pdf, text, encoding):
        text_out = []
        laparams = LAParams(line_margin=0.18, line_overlap=0.4, all_texts=False)
        if input_pdf.startswith('\\'):
            if not self.output_io.getvalue():
                extract_text_to_fp(self.input_file, self.output_io,
                                   laparams=laparams,
                                   output_type='html', codec=None)
        else:
            if not self.output_io.getvalue():
                with open(self.flat_pdf, 'rb') as input_file:
                    extract_text_to_fp(input_file, self.output_io,
                                       laparams=laparams,
                                       output_type='html', codec=None)
        html = BeautifulSoup(self.output_io.getvalue(), 'html.parser')
        results = html.find_all(
            lambda tag: tag.name == "div"
            and ' '.join(text.replace('\n', '').split()[:3]) in tag.text.replace('\n', ''))
        if results and 'bold' in str(results[-1]).lower():
            for span in results[-1]:
                if 'bold' in span['style'].lower():
                    text_out.append(f'<b>{span.text}</b>')
                else:
                    text_out.append(span.text)
            return ' '.join(text_out)
        return None
Example #14
def attribute(input_pdf, pages, text):
    text_out = []
    output_io = io.StringIO()
    with open(input_pdf, 'rb') as input:
        extract_text_to_fp(input,
                           output_io,
                           page_numbers=[int(pages) - 1],
                           laparams=LAParams(line_margin=0.18,
                                             line_overlap=0.4,
                                             all_texts=False),
                           output_type='html',
                           codec=None)

    html = BeautifulSoup(output_io.getvalue(), 'html.parser')
    results = html.find_all(lambda tag: tag.name == "div" and fuzz.ratio(
        text.lower(),
        tag.text.lower().replace('\n', '')) > 70)
    #     print(html)
    if results:
        if 'bold' in str(results[-1]).lower():
            for span in results[-1]:
                if 'bold' in span['style'].lower():
                    new_text = span.text.split('\n')
                    text_out.append(f'<b>{new_text[0]}</b>')
                if 'bold' not in span['style'].lower():
                    #                 print('yes')
                    new_text = span.text.split('\n')
                    text_out.append(new_text[0])
            #             print(' '.join(text_out))
            return ' '.join(text_out)
        else:
            return None
Example #15
def Cognitive_PDF(PATH_PDF):
    from io import StringIO
    from pdfminer.high_level import extract_text_to_fp

    output_string = StringIO()
    with open(PATH_PDF, 'rb') as fin:
        extract_text_to_fp(fin, output_string)
    return output_string.getvalue()
Example #16
def extracttextfp(i):
    output = StringIO()
    with open(i, 'rb') as fr:
        extract_text_to_fp(fr, output, output_type='text', laparams=LAParams())
    with open('MomentText/Round 2/file.txt', 'w', encoding='utf-8') as fw:
        fw.write(output.getvalue())
Example #17
    def parse_file(self):
        output = StringIO()
        with open(self.filepath, 'rb') as pdf_file:
            extract_text_to_fp(pdf_file,
                               output,
                               laparams=LAParams(),
                               output_type='html',
                               codec=None)
            self.tree = etree.parse(StringIO(output.getvalue()),
                                    etree.HTMLParser())
Example #18
    def read(self, path, html=False):
        text = StringIO()
        if html:
            with open(path, "rb") as f:
                extract_text_to_fp(f, text, laparams=LAParams(),
                                   output_type="html", codec=None)
            text = text.getvalue()
        else:
            text = extract_text(path)
        return text
Example #19
    def _parse_agenda(self, response):
        lp = LAParams(line_margin=5.0)
        out_str = StringIO()
        extract_text_to_fp(BytesIO(response.body), out_str, laparams=lp)
        pdf_text = out_str.getvalue()
        date_match = re.search(r"[A-Z][a-z]{2,8} \d{1,2},? \d{4}", pdf_text)
        if date_match:
            date_str = date_match.group().replace(",", "")
            date_obj = datetime.strptime(date_str, "%B %d %Y").date()
            self.agenda_map[date_obj] = [{"title": "Agenda", "href": response.url}]
Example #20
def extract_pdf_pdfminer_format_without_output(pdf_path):
    output_string = StringIO()
    with open(pdf_path, 'rb') as f:
        extract_text_to_fp(f,
                           output_string,
                           laparams=LAParams(),
                           output_type='html',
                           codec=None)
        context = output_string.getvalue()
    return context
Example #21
def convert_pdf_to_xml(path):
    '''get all pdf data as xml file format'''
    output = StringIO()
    with open(path, 'rb') as pdf_file:
        extract_text_to_fp(pdf_file,
                           output,
                           laparams=LAParams(),
                           output_type='xml',
                           codec=None)
    xml = output.getvalue()
    return xml
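pdfminer's XML output keeps per-character detail in <text> elements with font and bbox attributes; a sketch of reading those back with the standard library ('sample.pdf' is a placeholder):

import xml.etree.ElementTree as ET

root = ET.fromstring(convert_pdf_to_xml('sample.pdf'))
for text_el in root.iter('text'):
    print(text_el.get('font'), text_el.get('bbox'), text_el.text)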
Example #22
    def text_pdf_fp(self,
                    document: str,
                    n_page: int = None,
                    max_pages: int = 0) -> str:
        output_string = StringIO()
        with open(document, 'rb') as f:
            extract_text_to_fp(f,
                               output_string,
                               maxpages=max_pages,
                               page_numbers=n_page)
        return self.clean_text(output_string.getvalue())
Example #23
    def parse_pdf(self):
        """
        The meat of the SportingCode instance. This attempts to take the
        downloaded PDF contents and convert it into Python objects that can
        be more easily parsed, read, and manipulated.
        """
        if self.parsed:
            return

        self.raw_content.write(urlopen(self.url).read())
        out_io = StringIO()
        extract_text_to_fp(
            self.raw_content,
            out_io,
            laparams=LAParams(),
            output_type='text',
            strip_control=True,
            codec=None
        )
        # Saved directly as it was parsed by pdfminer.six
        self.raw_parsed_content = out_io.getvalue().strip().replace("\x0c", "")  # drop form-feed page breaks

        # We then run the content through a bunch of custom filters that will
        # massage the PDF contents into something more friendly to parse
        page_splitter = '\n\n'+self.PAGE_BREAK_INDICATOR

        # 1. Since each page has the same footer, we can use that to replace
        #    the hard to read page breaks with something that says `<! -- PAGE BREAK -->`
        self.parsed_content = re.sub(
            r'[\n ]*Version - 2018.09[\n ]*\d+[\n ]*',
            page_splitter,
            self.raw_parsed_content
        )

        # 2. Remove the first couple pages (title and table of contents) since we
        #    don't really care to parse these
        self.parsed_content = self.parsed_content.split(
            page_splitter,
            self.start_page_parsing_at-1
        )[self.start_page_parsing_at-1]

        # 3. Iterate through the sporting code and if the line contains a section ID or index
        #    then we will create a new section, or else we will keep appending to the current.
        self.parse_content_into_sections()

        # 4. Try to build a section hierarchy, meaning 1.1. is a child of 1.
        #    This will allow us to easily grab all sections including children
        self.build_section_hierarchy()

        # Uncomment this if you want to write the sporting code to a file to check it
        # with open('sporting_code.md', 'w') as f:
        #     f.write(self.markdown())

        self.parsed = True
Example #24
def extracttexthtml(i):
    output = StringIO()
    with open(i, 'rb') as fr:
        extract_text_to_fp(fr,
                           output,
                           output_type='html',
                           laparams=LAParams(),
                           codec=None)
    with open('K3407623935.html', 'w', encoding='utf-8') as fw:
        fw.write(output.getvalue())
Example #25
def extract_text_from_pdf_bio(pdf_fo: BinaryIO) -> str:
    """
    Extracts text from a PDF

    :param pdf_fo: a byte file object representing a PDF file
    :return: extracted text
    :raises pdfminer.pdftypes.PDFException: on invalid PDF
    """
    out_fo = StringIO()
    layout = LAParams(all_texts=True)
    extract_text_to_fp(pdf_fo, out_fo, laparams=layout)
    return out_fo.getvalue()
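The caller owns the file handle here, since the function takes an already-open binary file object; 'document.pdf' is a placeholder:

with open('document.pdf', 'rb') as pdf_fo:
    text = extract_text_from_pdf_bio(pdf_fo)
print(text[:200])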
Example #26
    def _parse_pdf(self, response):
        lp = LAParams(line_margin=5.0)
        out_str = StringIO()
        extract_text_to_fp(BytesIO(response.body), out_str, laparams=lp)
        pdf_text = re.sub(r"\s+", " ", out_str.getvalue()).strip()
        date_match = re.search(r"[A-Z][a-z]{2,8} \d{1,2},? \d{4}", pdf_text)
        if not date_match:
            return
        date_obj = datetime.strptime(date_match.group().replace(",", ""), "%B %d %Y").date()
        self.link_date_map[date_obj].append({
            "title": "Agenda" if "agenda" in response.url.lower() else "Minutes",
            "href": response.url,
        })
Example #27
    def _parse_pdf(self, response):
        """Parse data from PDF file of schedule"""
        lp = LAParams(line_margin=5.0)
        out_str = StringIO()
        extract_text_to_fp(BytesIO(response.body), out_str, laparams=lp)
        pdf_text = out_str.getvalue()
        split_dates = re.split(r"([A-Z][a-z]{2,8}\s+\d{1,2}[ \n$])", pdf_text)
        desc_str = split_dates[0]
        self._validate_location(desc_str)

        date_groups = [split_dates[1]]
        for split_line in split_dates[2:]:
            if re.search(r"([A-Z][a-z]{2,8}\s+\d{1,2}[ \n$])", split_line):
                date_groups.append(split_line)
            else:
                date_groups[-1] = date_groups[-1] + split_line
        year_str = re.search(r"\d{4}", desc_str).group()

        for date_group in date_groups:
            item = date_group.strip()
            date_str = re.search(r"^[A-Z][a-z]{2,8} \d{2}", item).group()
            if "Hearing" in item:
                time_strs = [
                    t[0]
                    for t in re.findall(r"(\d{1,2}(:\d{2})? [APM]{2})", item)
                ]
                details = [
                    ("Public Hearing", time_strs[0].lower()),
                    ("Board", time_strs[1].lower()),
                ]
            else:
                details = [("Board", "5:30 pm")]

            for title, start_str in details:
                meeting = Meeting(
                    title=title,
                    description="",
                    classification=self._parse_classification(title),
                    start=self._parse_start(date_str, start_str, year_str),
                    end=None,
                    all_day=False,
                    time_notes="",
                    location=self.location,
                    links=[],
                    source=response.url,
                )

                meeting["status"] = self._get_status(meeting, text=item)
                meeting["id"] = self._get_id(meeting)

                yield meeting
Example #28
    def _parse_schedule_pdf(self, response):
        """Parse dates and details from schedule PDF"""
        lp = LAParams(line_margin=0.1)
        out_str = StringIO()
        extract_text_to_fp(BytesIO(response.body), out_str, laparams=lp)
        pdf_text = out_str.getvalue().replace("\n", "")
        # Remove duplicate characters not followed by lowercase (as in 5:00pm)
        clean_text = re.sub(r"([A-Z0-9:])\1(?![a-z])", r"\1", pdf_text, flags=re.M)
        # Remove duplicate spaces
        clean_text = re.sub(r"\s+", " ", clean_text)
        year_str = re.search(r"\d{4}", clean_text).group()
        self._validate_location(clean_text)

        for date_str in re.findall(r"[A-Z]{3,10}\s+\d{1,2}(?!\d)", clean_text):
            self.meeting_starts.append(self._parse_start(date_str, year_str))
Example #29
    def parse(self, fname):
        """ Assumes the input file [fname] is small enough to read in its entirety\
            into memory.  This should be fixed to use a temporary file otherwise. """

        outfp = io.StringIO()
        with open(fname, "rb") as fp:

            try:
                # Pass the buffers explicitly rather than expanding **locals()
                high_level.extract_text_to_fp(fp, outfp)
            except pdfdocument.PDFTextExtractionNotAllowed as e:
                raise ReaderException(e)
            except pdfparser.PDFSyntaxError as e:
                raise ReaderException(e)

        outfp.seek(0)
        contents = outfp.read()

        return PdfReader._replace_cids_(contents)
Example #30
def download_pdf_url(url):
    headers = {
        'User-Agent':
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
    }
    pdf_bytes = requests.get(url, headers=headers, timeout=10).content
    with open('temp.pdf', 'wb') as f:
        f.write(pdf_bytes)
    output_string = StringIO()
    with open('temp.pdf', 'rb') as f:
        try:
            extract_text_to_fp(f,
                               output_string,
                               laparams=LAParams(),
                               output_type='html',
                               codec=None)
        except PDFSyntaxError:
            print('Could not read this pdf')
    return output_string.getvalue().strip()
Example #31
File: utils.py Project: tutorcruncher/pydf
def pdf_text(pdf_data: bytes) -> str:
    laparams = pdfminer.layout.LAParams()
    output = StringIO()
    high_level.extract_text_to_fp(BytesIO(pdf_data), output, laparams=laparams)
    return output.getvalue()
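A minimal round trip for pdf_text, assuming the excerpt's module-level imports (StringIO, BytesIO, pdfminer.layout, high_level) and a placeholder file name:

with open('invoice.pdf', 'rb') as f:
    print(pdf_text(f.read()))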