def _import_heading(self, heading_filename):
    """Return the text content of the Work's heading PDF.

    :param heading_filename: path handed to ``PDFParser`` for the heading file.
    :return: whatever ``PDFParser.get_text_content()`` yields for that file.
    """
    self.logger.debug('_import_heading {0}'.format(heading_filename))

    # the heading is kept in its own PDF; parse it and hand back its text
    return PDFParser(heading_filename).get_text_content()
def count_words(run_once=False, txt_directory=None):
    """Feed every line of the text files in a directory to a ``PDFParser``.

    :param run_once: when true, stop after the first file has been processed
        (useful for a quick smoke run).
    :param txt_directory: directory holding the text files. Defaults to the
        original lab directory so existing callers keep working.
    :return: tuple ``(parser.word_counts, parser.total_words)`` as accumulated
        by the parser.
    """
    if txt_directory is None:
        txt_directory = '/Users/samuelkaeser/Documents/University/Classes/EE_460J/Homework/Lab5/txts'

    parser = PDFParser()
    # os.listdir() returns entries in arbitrary order; sorting makes the file
    # picked by ``run_once`` deterministic.
    for txt in sorted(os.listdir(txt_directory)):
        txt_path = os.path.join(txt_directory, txt)
        # sub-directories cannot be opened as text files; skip them
        if not os.path.isfile(txt_path):
            continue
        with open(txt_path, 'r') as f:
            for line in f:
                parser.parse(line)
        if run_once:
            break
    return parser.word_counts, parser.total_words
def _import_impression(self, work, publishers_page, f_path):
    """Import the impression described by the PDF at ``f_path``.

    Nothing is created when the parser yields no impression code. Otherwise
    the PDF is stored as a ``Document`` tagged ``'impression'``, an
    ``Impression`` is populated from the parser and added as a child of
    ``work``, and its ``Publisher`` is looked up by title (created under
    ``publishers_page`` when missing).
    """
    self.logger.debug('Parsing {}'.format(f_path))
    parser = PDFParser(f_path)
    code = parser.get_impression_code()
    if not code:
        # no impression code in this PDF: nothing to import
        return

    self.logger.debug('Impression: ' + code)

    # keep the source PDF alongside the impression
    pdf_document = Document(title=code)
    with open(f_path, 'rb') as pdf_handle:
        wrapped_pdf = File(pdf_handle)
        pdf_document.file.save(os.path.basename(f_path), wrapped_pdf)
    pdf_document.tags.add('impression')

    # build the impression itself
    impression = Impression()
    impression.title = code
    impression.impression_title = parser.get_title()
    impression.content = parser.get_text_content()
    impression.pdf = pdf_document

    # codes absent from the known ordering are pushed to the end (999)
    try:
        position = self._order_of_impressions.index(code.lower())
    except Exception:
        known_codes = ', '.join(self._order_of_impressions)
        self.logger.error(
            u'{0} missing from order of impressions, which consists of: {1}'.format(
                code, known_codes))
        position = 999
    impression.sort_order = position

    impression.slug = safe_slugify(impression.title, Impression)
    impression.comments = parser.get_comments()
    self._import_copies(impression, parser, code)

    # the publisher code is the last dash-separated part of the title
    publisher_code = impression.title.rsplit('-', 1)[-1]
    publisher = Publisher.objects.filter(title=publisher_code).first()
    if not publisher:
        publisher = Publisher(title=publisher_code)
        publisher.slug = slugify(publisher_code)
        publishers_page.add_child(instance=publisher)

    impression.publisher = publisher
    work.add_child(instance=impression)
def update_uid_list():
    """Instantiate every publisher parser and refresh the RSC UID list.

    Only the RSC refresh is currently active; the other refresh calls are
    kept below as disabled reference lines.
    """
    elsevier_parser = ElsevierParser()
    springer_parser = SpringerParser()
    pmc_parser = PMCParser()
    pdf_based_parser = PDFParser()
    rsc_parser = RSCParser()

    # Disabled refreshes:
    # elsevier_parser.update_uid_list()
    # springer_parser.update_uid_list()
    # pmc_parser.update_uid_list()
    # pdf_based_parser.update_uid_list('APS')
    # pdf_based_parser.update_uid_list('ACS')
    # pdf_based_parser.update_uid_list('Wiley')
    # pdf_based_parser.update_uid_list('IUCr')
    # pdf_based_parser.update_uid_list('RSC')
    # pdf_based_parser.update_uid_list('IOP_JSON')

    rsc_parser.update_uid_list()
def _import_impression(self, work, publishers_page, f_path):
    """Create an Impression page under ``work`` from the PDF at ``f_path``.

    The method returns early when the parser finds no impression code.
    Otherwise it saves the PDF as a ``Document`` tagged ``'impression'``,
    fills an ``Impression`` from the parsed data, imports its copies, attaches
    the matching ``Publisher`` (creating it under ``publishers_page`` if it
    does not exist yet) and adds the impression as a child of ``work``.
    """
    self.logger.debug('Parsing {}'.format(f_path))
    parser = PDFParser(f_path)
    code = parser.get_impression_code()
    if not code:
        return

    self.logger.debug('Impression: ' + code)

    # Create an Impression PDF Document.
    source_document = Document(title=code)
    with open(f_path, 'rb') as source_file:
        django_file = File(source_file)
        source_document.file.save(os.path.basename(f_path), django_file)
    source_document.tags.add('impression')

    # Create and populate the impression.
    impression = Impression()
    impression.title = code
    impression.impression_title = parser.get_title()
    impression.content = parser.get_text_content()
    impression.pdf = source_document

    # Unknown codes fall back to 999 so they sort after the known ones.
    try:
        order_index = self._order_of_impressions.index(code.lower())
    except Exception:
        message = u'{0} missing from order of impressions, which consists of: {1}'.format(
            code, ', '.join(self._order_of_impressions))
        self.logger.error(message)
        order_index = 999
    impression.sort_order = order_index

    impression.slug = safe_slugify(impression.title, Impression)
    impression.comments = parser.get_comments()
    self._import_copies(impression, parser, code)

    # The trailing dash-separated segment of the title names the publisher.
    title_parts = impression.title.split('-')
    publisher_code = title_parts[-1]
    publisher = Publisher.objects.filter(title=publisher_code).first()
    if not publisher:
        publisher = Publisher(title=publisher_code)
        publisher.slug = slugify(publisher_code)
        publishers_page.add_child(instance=publisher)

    impression.publisher = publisher
    work.add_child(instance=impression)
# STEP 1: import parser class from pdf_parser import PDFParser # STEP 2: instantiate class parser = PDFParser() # STEP 3: get available forms (i9, etc.) forms = parser.available_forms() print(forms) # STEP 4: get form details (i.e. array of fields/questions object) details = parser.form_details('dmv44') print(details) # STEP 5: fill form with form_name and answers dict, returns bytes dummy_answers = { 'ssn': '123456789', 'first_name': 'lil\'', 'last_name': 'pea', 'middle_name': '', 'address': '21 Pea Rd', 'apt_number': '1c', 'city': 'New York', 'state': 'NY', 'date_of_birth': '02022019', 'telephone_number': '917-PEA-PEA', 'applying_for': 1, 'purpose': 4, 'organ': 1 }
def parse_PDF():
    """Filter the IOP JSON archive by keyword and record matching DOIs.

    In its current (debugging) state the function scans every ``.json`` file
    in the IOP archive directory, keeps those whose body text contains one of
    the single-word search terms, appends ``<pdf name> -> <doi url>`` lines to
    ``iop_filtered_list.txt`` and then terminates the process via
    ``sys.exit()``. The PDF-walking pass that follows is therefore
    unreachable until that debugging exit is removed.
    """
    pdf_p = PDFParser()

    # Disabled: interactive pass over the RSC archive.
    # dir = "/home/gpark/corpus_web/tdm/archive/RSC"
    # for filename in os.listdir(dir):
    #     if filename.endswith(".pdf"):
    #         print(filename)
    #         pdf_p.parse(os.path.join(dir, filename))
    #         input("Press Enter to continue...")

    cnt_article_w_keyword = 0
    # search terms, lowercased
    terms = [term.lower() for term in
             ('EXAFS', 'XANES', 'NEXAFS', 'pair distribution function')]
    num_of_files = 0
    # check_point_found = False

    archive_dir = "/home/gpark/corpus_web/tdm/archive/IOP_JSON"

    # --- debugging -------------------------------------------------------
    # Only the first three terms are single tokens, so only they can match
    # against whitespace-split sentence tokens.
    single_word_terms = terms[:3]
    file_doi = {}
    for json_name in os.listdir(archive_dir):
        if not json_name.endswith(".json"):
            continue
        with open(os.path.join(archive_dir, json_name), "r") as read_file:
            data = json.load(read_file)
        for sent in data['body_text']:
            tokens = {word.lower() for word in sent['sent'].split()}
            if not tokens.isdisjoint(single_word_terms):
                # first matching sentence is enough: remember the article
                file_doi[json_name.replace('.json', '.pdf')] = data['uid']
                break

    with open("iop_filtered_list.txt", 'a') as out_file:
        out_file.writelines(
            pdf_name + ' -> https://doi.org/' + doi + '\n'
            for pdf_name, doi in file_doi.items())

    sys.exit()
    # --- end debugging (everything below is unreachable while the exit above
    # is in place) ---------------------------------------------------------

    for root, dirs, files in os.walk(archive_dir):
        # traverse the subdirectories in reverse lexicographic order of
        # their names
        dirs.sort(reverse=True)
        for pdf_name in files:
            if not pdf_name.endswith(".pdf"):
                continue

            # Disabled checkpoint: when an error occurs, to start after the
            # last processed file.
            # if check_point_found == False:
            #     if file == 'epl_38_6_453.pdf':
            #         check_point_found = True
            #         continue
            #     else:
            #         continue

            iop_meta_file = os.path.join(root, '.article')
            if not os.path.exists(iop_meta_file):
                continue

            pdf_path = os.path.join(root, pdf_name)
            pdf_p.parse(pdf_path, terms, iop_meta_file)
            num_of_files += 1
            print('>> file: ', pdf_path, ' / num_of_files: ', num_of_files)

            # input("Press Enter to continue...")
            # if file in ['jpmater_1_1_01LT02.pdf', 'jpmater_1_1_015010.pdf',
            #             'jpmater_1_1_015006.pdf', 'mfm_1_1_015005.pdf']:
            # if file in ['jpco_3_1_015002.pdf']:
            #     input("Press Enter to continue...")
            # if pdf_p.parse(os.path.join(root, file), terms) == True:  # len(body_text) == 0 -> True
            #     cnt_article_w_keyword += 1

    print(cnt_article_w_keyword)
def _question_prompt(question_object):
    """Build the text for one form question, including its answer hint."""
    question_type = question_object["type"]
    added_string = ""
    if question_type == "bool":
        added_string = "(Yes / No)"
    elif question_type == "option":
        # list the choices with their index so the user can answer by number
        added_string = "\n" + "".join(
            "\n{}) {}".format(index, option)
            for index, option in enumerate(question_object["options"]))
    return "{} {}".format(question_object["question"], added_string)


def _send_question(recipient_id, question_object, translate_client, target_lang):
    """Send one question to the user, translated unless their language is English."""
    text_to_send = _question_prompt(question_object)
    if target_lang == 'en':
        translated_text = text_to_send
    else:
        translated_text = translate_client.translate(
            text_to_send, target_language=target_lang)['translatedText']
    bot.send_text_message(recipient_id, translated_text)


def start_questions(recipient_id, payload, txt=None):
    """Drive the question/answer conversation that fills a PDF form.

    Depending on the state stored in ``user_data[recipient_id]`` the call:

    * form already done: texts the public file URL to the phone number in
      ``txt`` (when ``txt`` is all digits) or reports that the form is
      already filled, then returns ``""``;
    * form in progress: stores ``txt`` (translated to English) as the answer
      to the current question, then either sends the next question or, when
      every question is answered, fills the form, uploads it and sends the
      user the link;
    * otherwise: initialises the session for the form and sends the intro
      text plus the first question.

    :param recipient_id: messenger id of the user; key into ``user_data``.
    :param payload: form name chosen by the user; when falsy, the form stored
        in the user's session is used.
    :param txt: the user's latest message, if any.
    :return: ``""`` when the form was already completed, otherwise ``None``.
    """
    if not payload:
        payload = user_data[recipient_id]["current_form"]

    known_user = recipient_id in user_data

    if known_user and user_data[recipient_id].get('done'):
        # txt defaults to None, so guard before calling a str method on it
        if txt is not None and txt.isdigit():
            # SECURITY NOTE(review): live-looking Twilio credentials are
            # hard-coded here; they should be rotated and loaded from
            # configuration/environment instead of living in source.
            account_sid = 'AC67c8a0b6b16986da80dc1ac0fdb26808'
            auth_token = '1db9a6a11f38faafa562d1e72607ba39'
            client = Client(account_sid, auth_token)
            client.messages.create(
                body=user_data[recipient_id]["public_url"],
                from_='+12153911286',
                to='+1' + txt)
        else:
            print("form is already filled. please reset")
        return ""

    # normalise the form name: keep letters and digits only, lowercased
    payload_correct = ''.join(
        ch.lower() for ch in payload if ch.isalpha() or ch.isdigit())
    print(payload)
    print(payload_correct)

    parser = PDFParser()
    details = parser.form_details(payload_correct)
    print(details)

    # answers are stored in English regardless of the user's language
    txt_trans = ""
    if txt is not None:
        translate_client = translate.Client()
        txt_trans = translate_client.translate(
            txt, target_language='en')['translatedText']

    if known_user and 'in_progress' in user_data[recipient_id]:
        print("in progress")
        session = user_data[recipient_id]
        answers = session["answers"]

        # the number of stored answers is the index of the current question
        current_key = details[len(answers)]['id']
        answers[current_key] = txt_trans

        if len(answers) == len(details):
            print("done!")
            session['done'] = True
            filled_form = parser.fill_form(session["current_form"], answers)

            import time
            timestamp = int(time.time())
            save(filled_form, 'files/filled_test-{}.pdf'.format(timestamp))

            user_info = bot.get_user_info(recipient_id)
            fname = user_info["first_name"]
            lname = user_info["last_name"]
            pdf_form = session["current_form"]

            translate_client = translate.Client()
            target_lang = session['lang']
            translated_text = translate_client.translate(
                "You are done! Here is your file",
                target_language=target_lang)['translatedText']
            bot.send_text_message(recipient_id, translated_text)

            storage_client = storage.Client()
            bucket_name = 'ezpz-files-public'
            bucket = storage_client.get_bucket(bucket_name)
            # NOTE(review): absolute developer-machine path; it only matches
            # the relative path passed to save() above when the process runs
            # from that project directory -- confirm and make configurable.
            source_file_name = "/Users/tomeraharoni/Documents/Projects/devfest/files/filled_test-{}.pdf".format(
                timestamp)
            destination_blob_name = "{}-{}-{}-{}-filled.pdf".format(
                pdf_form, fname, lname, timestamp)
            blob = bucket.blob(destination_blob_name)
            blob.upload_from_filename(source_file_name)
            blob.make_public()

            bot.send_text_message(recipient_id, blob.public_url)
            session['public_url'] = blob.public_url

            translated_text = translate_client.translate(
                "If you want the file sent to your phone, please type in your number",
                target_language=target_lang)['translatedText']
            bot.send_text_message(recipient_id, translated_text)
            return

        # more questions remain: ask the next one
        translate_client = translate.Client()
        _send_question(recipient_id, details[len(answers)],
                       translate_client, session['lang'])
    else:
        print("first question!")
        user_data[recipient_id]["in_progress"] = True
        user_data[recipient_id]["answers"] = {}
        user_data[recipient_id]["current_form"] = payload_correct

        translate_client = translate.Client()
        target_lang = user_data[recipient_id]['lang']

        filling_intro_text = 'Sure! I can help you with your {} form'.format(
            payload)
        filling_intro_text_tra = translate_client.translate(
            filling_intro_text, target_language=target_lang)['translatedText']
        bot.send_text_message(recipient_id, filling_intro_text_tra)

        _send_question(recipient_id, details[0], translate_client, target_lang)