def extract_text_from_pdf():
    # Only process PDFs whose text has not been extracted yet.
    interruption_pdfs = InterruptionPdf.objects.filter(pdf_text__isnull=True)
    for interruption_pdf in interruption_pdfs:
        with open(interruption_pdf.pdf_file.path, "rb") as f:
            pdf = PDF(f)

        # get_or_create keeps the task idempotent if it is re-run.
        pdf_text, _ = InterruptionPdfText.objects.get_or_create(
            pdf=interruption_pdf, defaults={"pdf_text": "\n".join(pdf)})
        print("Extracted text from PDF {}".format(pdf_text))
Example #2
def _need_ocr(inpdf, minwords):
    with open(inpdf, "rb") as f:
        pdf = PDF(f)
    text = ' '.join(pdf)
    # Strip common punctuation, then count the remaining words; a very
    # low count suggests the PDF has no real text layer.
    cleaned = text.replace(',', '').replace("'", '').replace('.', '')
    word_list = cleaned.lower().split()
    if len(word_list) > minwords:
        return False, text
    return True, ""
Example #3
    def to_text(self) -> TextInterface:
        """
        Converts the given PDF into text. If the PDF is
        not valid, it just returns an empty string.
        :return: A string that represents the PDF
        """
        if self.is_valid():
            buffer = BytesIO(self.data)
            pdf = PDF(buffer)
            return self._text_adapter_klass('\n\n'.join(pdf))

        return self._text_adapter_klass('')
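Because PDF accepts any binary file-like object, the same BytesIO approach works for data that never touches disk, such as a downloaded document. A small sketch assuming requests is available (the URL is a placeholder):

from io import BytesIO

import requests
from pdftotext import PDF

resp = requests.get("https://example.com/sample.pdf")  # placeholder URL
pdf = PDF(BytesIO(resp.content))
text = "\n\n".join(pdf)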
Example #4
def main(args):
    files = [x for x in os.listdir(args.path) if x.endswith(".pdf")]

    for file in files:
        with open(f"{args.path}/{file}", 'rb') as fd:
            pdf = PDF(fd)
        name = file.split('.')[0]
        # Track the page index so each matching page gets its own file.
        for idx, page in enumerate(pdf):
            line = page.split("\n")[0].split()
            if len(line) == 10 and "CARTELERA" in line:
                with open(f"{args.folder}{name}-page{idx}.data", 'w') as fd:
                    fd.write(page)
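For completeness, a hedged sketch of how main could be wired up with argparse; the argument names simply mirror the args.path and args.folder attributes the function reads:

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("path", help="directory containing the PDFs")
    parser.add_argument("folder", help="output prefix for the .data files")
    main(parser.parse_args())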
Example #5
def readTextFromPdf(filename):
    text = ""
    with open(filename, "rb") as filehandle:
        try:
            # Join the pages into one string, then collapse all
            # whitespace so the whole text becomes a single-line string.
            text = " ".join(" ".join(PDF(filehandle)).split()).strip()
        except Exception:
            logging.exception(
                f"Cannot extract text from {path.basename(filename)}.")
    if text == "":
        raise Exception("PDF does not contain text.")
    return text
Example #6
    def analyze_pdf(self):
        if self.pdf:
            return

        file = self.get_djbr_filepath()
        with open(file, "rb") as f:
            self.pdf = PDF(f)
        self.pdf = "".join(self.pdf)

        # djbr:version -- the dot in "versi.n" also matches the
        # accented "versión".
        pattern = '(versi.n).*'
        result = re_search(pattern, self.pdf, re_unicode)
        if result is None:
            return
        for value in str(result.group()).split():
            if value.replace('.', '').isdigit():
                self.djbr_version = value
                break
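A quick standalone check of the same extraction logic with re directly; the sample string is made up:

import re

sample = "... versión 2.1.3 del formulario"
match = re.search(r'(versi.n).*', sample, re.UNICODE)
version = next(v for v in match.group().split()
               if v.replace('.', '').isdigit())
print(version)  # -> 2.1.3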
Example #7
def extract_corpora_from_file(filepath,
                              file_index,
                              file_extension="pdf") -> str:
    """

    :type filepath: str
    :type file_index: int
    :type file_extension: str
    """
    corpora = str()
    if file_extension == "pdf":
        with open(filepath, "rb") as file:
            pdf_file = PDF(file)
        corpora = "\n".join(pdf_file)
    else:
        # Plain-text files only need reading, not parsing.
        with open(str(filepath), "r", encoding='utf-8') as file:
            corpora = file.read()
    print(f"Processing of file [{file_index}] -> '{filepath}' "
          "was completed successfully!")
    return corpora
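A hedged usage sketch that feeds a whole folder through extract_corpora_from_file; the folder name is a placeholder and the extension is derived from each filename:

import os

folder = "corpus"  # placeholder input directory
corpora = [
    extract_corpora_from_file(os.path.join(folder, name), i,
                              file_extension=name.rsplit('.', 1)[-1])
    for i, name in enumerate(sorted(os.listdir(folder)))
]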
Example #8
from os.path import splitext
from pdftotext import PDF
from textract import process
from chardet import detect

filenames = ["Opsti-deo.pdf"]
for filename in filenames:
    no_ext = splitext(filename)[0]

    # Extract with pdftotext directly, writing page by page.
    with open(no_ext + "_pdftotext.txt", "w", encoding="utf-8") as file:
        with open(filename, "rb") as f:
            pdf = PDF(f)
            for page in pdf:
                file.write(page)

    # Extract with textract and guess the encoding of its byte output.
    text = process(filename, method="pdftotext")
    text = text.decode(detect(text)["encoding"])

    with open(no_ext + "_textract.txt", "w", encoding="utf-8") as file:
        file.write(text)
Example #9
import io
import re

import requests
import pandas as pd
from bs4 import BeautifulSoup
from pdftotext import PDF
from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver_loc = '/Users/avinaashkoganti/chromedriver'

#Scraping Index for VTI
marketCaps = []
dates = []
r = requests.get("http://www.crsp.org/fact-sheet-archive")
soup = BeautifulSoup(r.text,features="html.parser")
for link in soup.find_all('a'):
    if re.search('crsptm', str(link.get('href')), re.IGNORECASE):
        pdf = requests.get(link.get('href'))
        open_pdf = io.BytesIO(pdf.content)
        read_pdf = PDF(open_pdf)[1]
        dates.append(link.text.replace('Quarter Ending ', ''))
        raw = re.findall(r'INDEX MARKET CAP\s+\d+,\d+,\d+',
                         read_pdf, re.IGNORECASE)[0]
        # Strip the label and the thousands separators, then scale.
        marketCaps.append(float(raw.upper()
                                .replace('INDEX MARKET CAP', '')
                                .replace(',', '')) / 1000)
vtiDF = pd.DataFrame({'date': pd.to_datetime(dates, infer_datetime_format=True),
                      'Market Cap': marketCaps})
vtiDF = vtiDF.sort_values(by='date', ascending=False).reset_index(drop=True)
vtiDF.to_csv('Data/vti_MC_data.csv', index=False)
print("VTI")
print(vtiDF)

#Scraping Index for VXUS
dates = []
marketCaps = []
links = []
driver = webdriver.Chrome(driver_loc,options=options)
driver.get('https://www.ftserussell.com/analytics/factsheets/home/search')
archive = driver.find_element_by_xpath('//a[@title="FTSE Global All Cap ex US Index"]/following-sibling::a')
driver.execute_script("arguments[0].click();", archive)
Example #10
def read_pdf(path: str):
    with open(path, "rb") as file:
        return PDF(file)
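The returned object supports len(), indexing, and iteration, so callers can pick pages as needed. A short usage sketch (the filename is a placeholder):

pdf = read_pdf("example.pdf")
print(len(pdf))               # number of pages
first_page = pdf[0]           # pages are plain strings
full_text = "\n\n".join(pdf)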
Example #11
def get_document(fp: Path):
    with fp.open("rb") as file:
        pdf = PDF(file)
    return pdf
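For encrypted documents, pdftotext also accepts a password at construction time. A minimal sketch, assuming the filename and password are placeholders:

from pdftotext import PDF

with open("locked.pdf", "rb") as f:
    pdf = PDF(f, password="s3cret")  # placeholder password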