def extract_text_from_pdf():
    """Backfill ``InterruptionPdfText`` rows for PDFs lacking extracted text.

    Iterates every ``InterruptionPdf`` whose ``pdf_text`` is still NULL,
    extracts the text with pdftotext, and stores it via ``get_or_create``
    so already-processed PDFs are never overwritten.
    """
    pending = InterruptionPdf.objects.filter(pdf_text__isnull=True)
    for record in pending:
        with open(record.pdf_file.path, "rb") as handle:
            pages = PDF(handle)
            text_row, _ = InterruptionPdfText.objects.get_or_create(
                pdf=record,
                defaults={"pdf_text": "\n".join(pages)},
            )
            print("Extracted text from PDF {}".format(text_row))
def _need_ocr(inpdf, minwords):
    """Decide whether the PDF at *inpdf* needs OCR.

    The PDF's embedded text layer is extracted and crudely tokenized;
    if it contains more than *minwords* words we assume the text layer
    is real and OCR is unnecessary.

    :param inpdf: path to the PDF file
    :param minwords: word-count threshold above which OCR is skipped
    :return: ``(False, text)`` when enough text was found,
             ``(True, "")`` when OCR is required
    """
    with open(inpdf, "rb") as f:
        text = ' '.join(PDF(f))
    # Strip commas, apostrophes and periods in a single C-level pass
    # instead of three chained .replace() calls.
    word_list = text.translate(str.maketrans('', '', ",'.")).lower().split()
    if len(word_list) > minwords:
        return False, text
    return True, ""
def to_text(self) -> TextInterface:
    """
    Converts the given PDF into text.
    If the PDF is not valid it just return an empty string
    :return: A string that represents the PDF
    """
    # Guard clause: invalid PDFs yield an empty-text adapter.
    if not self.is_valid():
        return self._text_adapter_klass('')
    pages = PDF(BytesIO(self.data))
    return self._text_adapter_klass('\n\n'.join(pages))
def main(args):
    """Extract 'CARTELERA' pages from every PDF under ``args.path``.

    Each PDF page whose first line has exactly 10 whitespace-separated
    tokens including "CARTELERA" is written to
    ``{args.folder}{name}-page{idx}.data``.

    Fixes two defects in the original:
    - ``idx`` was initialized to 0 but never incremented, so every
      matching page overwrote the same ``…-page0.data`` file; pages are
      now numbered with ``enumerate``.
    - the inner ``with open(...) as fd`` shadowed the outer PDF file
      handle ``fd``; distinct names are used now.
    """
    files = [x for x in os.listdir(args.path) if x.endswith(".pdf")]
    for file in files:
        name = file.split('.')[0]
        with open(f"{args.path}/{file}", 'rb') as src:
            pdf = PDF(src)
            for idx, page in enumerate(pdf):
                # Inspect only the tokens of the page's first line.
                line = page.split("\n")[0].split()
                if len(line) == 10 and "CARTELERA" in line:
                    with open(f"{args.folder}{name}-page{idx}.data", 'w') as out:
                        out.write(page)
def readTextFromPdf(filename):
    """Return the whitespace-normalized text content of a PDF.

    All pages are joined and every run of whitespace is collapsed to a
    single space, producing one single-line string.

    :param filename: path to the PDF file
    :raises Exception: when no text could be extracted
    """
    text = ""
    with open(filename, "rb") as filehandle:
        try:
            # get array of pages, join them into single string,
            # split the string by new lines,
            # join it into a single string, strip whitespaces
            # now we got the whole text in a single line string
            text = " ".join(" ".join(PDF(filehandle)).split()).strip()
        except Exception:
            # A bare `except:` would also swallow KeyboardInterrupt and
            # SystemExit; catch Exception instead.  logging.exception
            # already records the traceback, so exc_info=True is redundant.
            logging.exception(
                f"Cannot extract text from {path.basename(filename)}.")
    if text == "":
        raise Exception("PDF does not contain text.")
    return text
def analyze_pdf(self):
    """Extract the DJBR version string from the instance's PDF.

    Loads the PDF text into ``self.pdf`` (idempotent: returns early if
    already loaded), searches for a "versión"-style marker, and stores
    the first numeric-looking token after it in ``self.djbr_version``.
    """
    if self.pdf:
        return
    file = self.get_djbr_filepath()
    with open(file, "rb") as f:
        self.pdf = PDF(f)
    self.pdf = "".join(self.pdf)
    # djbr:version — '.' matches the accented character in "versión".
    pattern = '(versi.n).*'
    result = re_search(pattern, self.pdf, re_unicode)
    if result is None:
        # Original code crashed with AttributeError on result.group()
        # when the marker was absent; leave djbr_version unset instead.
        return
    result = str(result.group()).split()
    for value in result:
        # A version like "1.2" is digits once the dots are removed.
        if value.replace('.', '').isdigit():
            self.djbr_version = value
            break
def extract_corpora_from_file(filepath, file_index, file_extension="pdf") -> str:
    """Read the textual corpus from *filepath*.

    PDFs are extracted page by page and joined with newlines; any other
    extension is read as a UTF-8 text file.

    :type filepath: str
    :type file_index: int
    :type file_extension: str
    :return: the file's full text
    """
    corpora = str()
    if file_extension == "pdf":
        with open(filepath, "rb") as file:
            pdf_file = PDF(file)
            corpora = "\n".join(pdf_file)
    else:
        # Mode "r", not "r+": the file is only read, and requesting
        # write access made read-only files fail to open.
        with open(str(filepath), "r", encoding='utf-8') as file:
            corpora = file.read()
    print(
        f"Processing of file [{file_index}] -> \'{filepath}\' was completed successfully!"
    )
    return corpora
from os.path import splitext
from pdftotext import PDF
from textract import process
from chardet import detect

# Compare two extraction backends on the same PDF: raw pdftotext vs.
# textract's pdftotext method (decoded with a chardet-detected encoding).
filenames = ["Opsti-deo.pdf"]
for filename in filenames:
    base = splitext(filename)[0]

    # Backend 1: pdftotext directly, written page by page.
    with open(base + "_pdftotext.txt", "w", encoding="utf-8") as out:
        with open(filename, "rb") as src:
            pdf = PDF(src)
            for page in pdf:
                out.write(page)

    # Backend 2: textract, which returns bytes of unknown encoding.
    raw = process(filename, method="pdftotext")
    decoded = raw.decode(detect(raw)["encoding"])
    with open(base + "_textract.txt", "w", encoding="utf-8") as out:
        out.write(decoded)
from pdftotext import PDF
from selenium import webdriver

# NOTE(review): this chunk also uses requests, BeautifulSoup, re, io and
# pd (pandas) — presumably imported earlier in the full file; confirm.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
# Hard-coded local chromedriver path — machine-specific.
driver_loc = '/Users/avinaashkoganti/chromedriver'

#Scraping Index for VTI
# Walk the CRSP fact-sheet archive, download each quarterly PDF whose
# link contains "crsptm", and pull the "INDEX MARKET CAP" figure from
# page 2 of each PDF (PDF(...)[1] is the second page).
marketCaps = []
dates = []
r = requests.get("http://www.crsp.org/fact-sheet-archive")
soup = BeautifulSoup(r.text, features="html.parser")
for link in soup.find_all('a'):
    if re.search('crsptm', str(link.get('href')), re.IGNORECASE):
        pdf = requests.get(link.get('href'))
        open_pdf = io.BytesIO(pdf.content)
        read_pdf = PDF(open_pdf)[1]
        dates.append(link.text.replace('Quarter Ending ', ''))
        # Parse e.g. "INDEX MARKET CAP 12,345,678", strip the label and
        # commas, and convert to billions (divide by 1000).
        marketCaps.append(float(re.findall('INDEX MARKET CAP\s+\d+,\d+,\d+', read_pdf, re.IGNORECASE)[0].upper().replace('INDEX MARKET CAP', '').replace(',', '')) / 1000)
# Newest quarter first, written out for downstream analysis.
vtiDF = pd.DataFrame({'date': pd.to_datetime(dates, infer_datetime_format=True), 'Market Cap': marketCaps}).sort_values(by='date', ascending=False).reset_index(drop=True)
vtiDF.to_csv('Data/vti_MC_data.csv', index=False)
print("VTI")
print(vtiDF)

#Scraping Index for VXUS
# The FTSE site needs a real browser: open the factsheet search page and
# click through to the "FTSE Global All Cap ex US Index" archive link.
# NOTE(review): the scraping logic appears to continue beyond this chunk.
dates = []
marketCaps = []
links = []
driver = webdriver.Chrome(driver_loc, options=options)
driver.get('https://www.ftserussell.com/analytics/factsheets/home/search')
archive = driver.find_element_by_xpath('//a[@title="FTSE Global All Cap ex US Index"]/following-sibling::a')
driver.execute_script("arguments[0].click();", archive)
def read_pdf(path: str):
    """Open the file at *path* in binary mode and return its parsed PDF."""
    with open(path, "rb") as handle:
        parsed = PDF(handle)
        return parsed
def get_document(fp: Path):
    """Parse and return the PDF document located at *fp*."""
    with fp.open("rb") as stream:
        return PDF(stream)