# Speak one page of a PDF aloud with pyttsx, or save the PDF's text as audio
# files (a fixed number of pages per file) by shelling out to espeak.
import subprocess

import pyttsx
import slate

loc = raw_input('Enter location of pdf:')
# PDFs are binary data; text mode can corrupt them on platforms that
# translate line endings.
with open(loc, 'rb') as f:
    doc = slate.PDF(f)

# int(raw_input()) is used instead of Python 2's input(), which eval()s
# whatever the user types.
choice = int(raw_input("Speak(0)/Save(1):"))
if choice == 1:
    page = int(raw_input('Enter number of to save in 1 file(0 for all):'))
    # 0 means "everything in one file"; a negative count would make the loop
    # below run forever, so it is treated the same way.
    if page <= 0:
        page = len(doc)
    i = 0
    while i < len(doc):
        # Slicing clamps at the end of the document, so the last chunk simply
        # holds whatever pages remain.
        t = "".join(doc[i:i + page])
        name = str(i) + " to " + str(i + page)
        i = i + page
        # NOTE: espeak's -w option writes WAV data; the ".mp3" extension is
        # kept only so output file names stay the same as before.
        subprocess.call(["espeak", "-w" + name + ".mp3", t])
else:
    # Page numbers entered by the user are 1-based.
    page = int(raw_input("Enter page number to speak:"))
    engine = pyttsx.init()
    engine.say(doc[page - 1])
    engine.runAndWait()
#!/usr/bin/env python2
# Extract the first-page text of page-one.pdf ... page-twenty-seven.pdf,
# drop the final character of each, and write the concatenation to the
# file named by the first command-line argument.
import slate
import sys

# Fail early with a usage message instead of an IndexError after all the
# PDFs have already been parsed.
if len(sys.argv) < 2:
    sys.exit('usage: %s OUTPUT_FILE' % sys.argv[0])

fb = (
    'one two three four five six seven eight nine ten ' +
    'eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen nineteen ' +
    'twenty twenty-one twenty-two twenty-three twenty-four twenty-five twenty-six twenty-seven'
)
lfn = fb.split()

parts = []
for fx in lfn:
    fn = 'page-%s.pdf' % fx
    # PDFs are binary data, so open them in binary mode.
    with open(fn, 'rb') as f:
        t = slate.PDF(f)
    # Keep the first page's text minus its last character
    # (presumably a trailing page separator -- TODO confirm).
    parts.append(t[0][0:-1])
txt = ''.join(parts)

with open(sys.argv[1], 'w') as f:
    f.write(txt)

# python 03-pages.py pages.txt
# base64 -d pages.txt > pages.png
# view the png "We like the moon" Joel and Alex Veitch rathergood.com
# Hmmm
# exiftool pages.png -> shows some text in the comment
def test_pdf_generation(self):
    # Render the sample text to PDF, then parse it back with slate.
    rendered = generator(source=SAMPLE_TEXT, header='header')
    pages = slate.PDF(StringIO(rendered))

    # The sample is expected to fill exactly nine pages.
    self.assertEqual(9, len(pages))

    # Spot-check known phrases on the first and fifth pages.
    expected_snippets = (
        (0, 'Lorem ipsum dolor'),
        (4, 'Sed egestas dui tellus'),
    )
    for page_index, snippet in expected_snippets:
        self.assertIn(snippet, pages[page_index])
def _print_tagged_sentence(sentence):
    """Print every token of *sentence* on one line as ``word│case│pos│ner│-``.

    The token text is always printed lower-cased. The case flag is ``U``
    only for purely alphabetic tokens that are not entirely lower-case;
    every other token, including non-alphabetic ones, gets ``L``.
    """
    for token in sentence:
        word = str(token)
        case_flag = '│U│' if word.isalpha() and not word.islower() else '│L│'
        print(word.lower(), case_flag, token.pos, '│', token.ner, '│-',
              end=" ", sep='')
    # Ends the token line and leaves one blank line after it.
    print('\n')


def preprocess_pdf(pdf):
    """Print tagged tokens for the first two sentences after 'Abstract'.

    The PDF's text is whitespace-normalised, everything before the first
    occurrence of ``'Abstract'`` is discarded, and the first 1000 encoded
    characters of the remainder are annotated. Up to two sentences are then
    printed, one line per sentence.

    Args:
        pdf: Path of the PDF file to read.

    Returns:
        None. Output goes to stdout. Nothing is printed when the text
        contains no ``'Abstract'`` marker.
    """
    #had to edit the next line because there was an error.
    #annotators = 'tokenize, ssplit, pos, lemma, ner, entitymentions, coref, sentiment, quote, openie'
    annotators = 'tokenize, ssplit, pos, lemma, ner'
    options = {'openie.resolve_coref': True}
    nlp = StanfordCoreNLP(annotators=annotators, options=options)

    with open(pdf, 'rb') as f:
        doc = slate.PDF(f)
    # Collapse every run of whitespace to a single space, across all pages.
    doc = ' '.join([' '.join(x.split()) for x in doc])

    text_split = doc.split('Abstract')
    if len(text_split) <= 1:
        # No 'Abstract' marker: there is no abstract to annotate.
        return

    text_no_title = ' '.join(text_split[1:])
    # Characters outside latin-1 are dropped.
    # NOTE(review): under Python 3 this produces bytes -- confirm that the
    # nlp client accepts bytes input.
    text_no_title = str(text_no_title).encode('latin-1', 'ignore')
    text_no_title = text_no_title[0:1000]
    document = nlp(text_no_title)

    #only formats the first 2 sentences in the abstract
    for index in range(2):
        try:
            sentence = document[index]
        except IndexError:
            # Fewer than two sentences were found (assumes the document
            # raises IndexError for a missing sentence, as a list would).
            break
        _print_tagged_sentence(sentence)