def create_collection(converted_files, new_files):
    """Create the record collection file, upload it to the FTP server
    and send an e-mail to inform about the harvest.

    :param converted_files: paths of per-record MARCXML files whose
        contents are concatenated into one ``<collection>`` file.
    :param new_files: paths of the harvested source files (used only
        for the counts reported in the e-mail body).
    """
    target_file = "edpsciences.%s.xml" % \
        (datetime.now().strftime("%Y-%m-%d"),)
    target_file = join(CFG_EDPSCIENCE_OUT_FOLDER, target_file)
    write_message("Creating collection file: %s" % (target_file,))
    with open(target_file, 'w') as collection:
        collection.write('<collection>\n')
        for fl in converted_files:
            # Context manager guarantees each record file is closed even
            # if reading raises (the original leaked the handle on error).
            with open(fl) as recordfile:
                collection.write(recordfile.read())
        collection.write('\n</collection>')
    # Upload only after the with-block so the file is flushed and closed.
    submit_records_via_ftp(target_file)
    body = ['From %s sources, found and converted %s records'
            % (len(new_files), len(converted_files)),
            '\t%s records ready to upload:\n' % (len(converted_files),),
            '\t%s uploaded to server:' % (target_file,)]
    body = '\n'.join(body)
    subject = "EDP Sciences harvest results: %s" % \
        (datetime.now().strftime("%Y-%m-%d %H:%M:%S"),)
    write_message(body)
    if submit_records_via_mail(subject, body, CFG_SITE_SUPPORT_EMAIL):
        write_message("Mail sent to %r" % (CFG_SITE_SUPPORT_EMAIL,))
    else:
        write_message("ERROR: Cannot send mail.")
def create_collection(batch_size, new_files, new_sources, directory, submit):
    """Create batched "collection" XML files that contain all the records.

    Records from ``new_files`` are concatenated into files of at most
    ``batch_size`` records each.  When ``submit`` is true the files are
    uploaded via FTP and a summary is mailed; otherwise the summary only
    lists the files that are ready for upload.
    """
    subject = "Consyn harvest results: %s" % \
        (datetime.now().strftime("%Y-%m-%d %H:%M:%S"),)
    batch = 1
    counter = 1
    files_to_upload = []
    collection = None
    date = datetime.now().strftime("%Y.%m.%d")
    prefix = "elsevier-{0}".format(date)
    for filename in new_files:
        if counter == 1:
            filepath = get_available_filename(directory, prefix, batch)
            collection = open(filepath, 'w')
            collection.write("<collection>\n")
        with open(filename) as f:
            collection.write(f.read() + '\n')
        counter += 1
        if counter == batch_size:
            collection.write("</collection>")
            collection.close()
            # Drop the reference so the flushed batch is not "closed"
            # again below.  The original kept the stale (closed) file
            # object around and, when the last batch was exactly full,
            # wrote to it after the loop and appended a duplicate path.
            collection = None
            files_to_upload.append(filepath)
            counter = 1
            batch += 1
    if collection is not None:
        # Flush the final, partially-filled batch.
        collection.write("</collection>")
        collection.close()
        files_to_upload.append(filepath)
    body = [
        'From %s sources, found and converted %s records' %
        (len(new_sources), len(new_files)),
        # Report the real record count instead of the previous
        # hard-coded "(batch - 1) * 500 + counter" approximation,
        # which was wrong whenever batch_size != 500.
        '\t%s records ready to upload:\n' % (len(new_files),)
    ]
    if submit:
        body += ['\tFiles uploaded to Server:']
        for filepath in files_to_upload:
            try:
                submit_records_via_ftp(filepath)
                filename = filepath.split('/')[-1]
                body.append("\t%s (%s records)" % (filename, batch_size))
            except Exception:
                # Record the failure but keep uploading remaining files.
                _errors_detected.append(
                    Exception("Failed to upload %s to FTP server" %
                              filepath))
                write_message("Failed to upload %s to FTP server" %
                              filepath)
    else:
        body += ['\tFiles ready for upload:']
        for filename in files_to_upload:
            body.append("\t%s (%s records)" % (filename, batch_size))
    if files_to_upload:
        body = '\n'.join(body)
        write_message(subject)
        write_message(body)
        if submit:
            if submit_records_via_mail(subject, body,
                                       CFG_CONSYNHARVEST_EMAIL):
                write_message("Mail sent to %r" %
                              (CFG_CONSYNHARVEST_EMAIL,))
            else:
                write_message("ERROR: Cannot send mail.")
    else:
        write_message("No new files!")
def create_collection(batch_size, new_files, new_sources,
                      directory, upload_FTP):
    """Create batched "collection" XML files that contain all the records.

    Records from ``new_files`` are concatenated into files of at most
    ``batch_size`` records under ``directory``.  The files are uploaded
    via FTP only when ``upload_FTP`` is true; a summary is always mailed.
    """
    subject = "Consyn harvest results: %s" % \
        (datetime.now().strftime("%Y-%m-%d %H:%M:%S"),)
    if new_files:
        batch = 1
        counter = 0
        date = datetime.now().strftime("%Y.%m.%d")
        filepath = "elsevier-%s-%s.xml" % (date, batch)
        filepath = join(directory, filepath).lstrip()
        files_to_upload = []
        # Open the batches explicitly instead of rebinding the name
        # inside a with-block: the original "with" only closed the
        # first batch file, leaving later batches to the GC.
        collection = open(filepath, 'w')
        collection.write("<collection>\n")
        try:
            for f in new_files:
                if counter == batch_size:
                    counter = 0
                    batch += 1
                    collection.write("</collection>")
                    collection.close()
                    files_to_upload.append(filepath)
                    filepath = "elsevier-%s-%s.xml" % (date, batch)
                    filepath = join(directory, filepath).lstrip()
                    collection = open(filepath, 'w')
                    collection.write("<collection>\n")
                with open(f, 'r') as xml_file:
                    collection.write(xml_file.read() + '\n')
                counter += 1
            collection.write("</collection>")
            files_to_upload.append(filepath)
        finally:
            collection.close()
        body = ['From %s sources, found and converted %s records' %
                (len(new_sources), len(new_files)),
                # Real count instead of the hard-coded
                # "(batch - 1) * 500 + counter" approximation.
                '\t%s records ready to upload:\n' % (len(new_files),)]
        if upload_FTP:
            # Bug fix: the original uploaded unconditionally; the flag
            # only changed the message header.
            body += ['\tFiles uploaded to Server:']
            for filepath in files_to_upload:
                try:
                    submit_records_via_ftp(filepath)
                    filename = filepath.split('/')[-1]
                    body.append("\t%s (%s records)" %
                                (filename, batch_size))
                except Exception:
                    write_message("Failed to upload %s to FTP server" %
                                  filepath)
        else:
            body += ['\tFiles ready for upload:']
            for filepath in files_to_upload:
                filename = filepath.split('/')[-1]
                body.append("\t%s (%s records)" % (filename, batch_size))
        if len(body) > 3:
            # The last batch usually holds fewer than batch_size
            # records; fix up its reported count.
            body[-1] = "\t%s (%s records)" % (filename, counter)
        body = '\n'.join(body)
        write_message(subject)
        write_message(body)
        report_records_via_mail(subject, body)
    else:
        write_message(subject)
        write_message("No new files")
def submit_records(self, records_filename, mode, update=False,
                   taskid=0, silent=False):
    """
    Performs the logic to submit given file (filepath) of records
    either by e-mail or using BibUpload with given mode.

    Taskid is given to indicate if the task submission should wait for any
    previously submitted tasks.

    The submission can also be made "silent" in the sense of not updating
    the modification date of the records.

    @param records_filename: filepath to XML file containing records.
    @type records_filename: string

    @param update: pick ``self.records_to_update`` (True) or
        ``self.records_to_insert`` (False) as the record set.
    @type update: bool

    @param mode: which submission mode is it?  ``"email"`` reports via
        mail + FTP; any other value submits a BibUpload task.
    @type mode: string

    @param taskid: bibsched taskid, wait for task to complete before submission
    @type taskid: int

    @param silent: do not update the modification date of the records
    @type silent: bool

    @return: returns the given taskid upon submission, or True/False from email.
    """
    # Select which record list this submission covers.
    if update:
        records_list = self.records_to_update
    else:
        records_list = self.records_to_insert

    # Check if we should create bibupload or e-mail
    if mode == "email":
        # Lets parse the records and find our IDs.
        list_of_dois = []
        for record in records_list:
            # We strip away the first part of the DOI for readability.
            list_of_dois.append('/'.join(record.doi.split('/')[1:]))
        # We send an e-mail to CFG_APSHARVEST_EMAIL and put file on AFS.
        body = "Harvested new records: %s" % (records_filename,)
        try:
            try:
                # Move the records file into the outgoing folder and
                # rewrite the path so later messages point at the new
                # location.
                shutil.move(records_filename, self.out_folder)
                records_filename = os.path.join(
                    self.out_folder,
                    os.path.basename(records_filename))
                body = "Harvested new records on %s. They are located here:\n %s" % \
                       (self.date_started.strftime("%Y-%m-%d %H:%M:%S"),
                        records_filename)
            except IOError, e:
                # Some IOError
                body = "Error while harvesting records: \nError saving %s - %s" % \
                       (records_filename, str(e))
                raise e
        finally:
            # NOTE(review): the FTP upload runs even when the move
            # failed (records_filename then still points at the
            # original path) — presumably a deliberate best-effort;
            # confirm before changing.
            submit_records_via_ftp(records_filename)
        body = "%s\nRecords harvested (%s total):\n%s\n" % (
            body,
            str(len(list_of_dois)),
            "\n".join(list_of_dois))
        body = "%s\nUploaded to FTP: %s" % (
            body,
            os.path.basename(records_filename)
        )
        res = submit_records_via_mail(self.mail_subject, body,
                                      CFG_APSHARVEST_EMAIL)
        write_message("Sent e-mail to %s with path to %s" %
                      (CFG_APSHARVEST_EMAIL, records_filename))
        return res
    else:
        # We submit a BibUpload task and wait for it to finish
        task_update_progress("Waiting for task to finish")
        if taskid != 0:
            write_message("Going to wait for %d to finish" % (taskid,))
        while not can_launch_bibupload(taskid):
            # Lets wait until the previously launched task exits.
            task_sleep_now_if_required(can_stop_too=False)
            time.sleep(5.0)
        taskid = submit_bibupload_for_records(mode, records_filename,
                                              silent)
        write_message("Submitted BibUpload task #%s with mode %s" %
                      (str(taskid), mode))
        return taskid
def main(args):
    """Harvest PoS records, fetch their fulltext PDFs, upload per-record
    XML to the FTP server and write insert/append/error batch files.

    :param args: command-line arguments; expects exactly one element,
        the path of the input OAI-PMH XML file.
    :raises Exception: on wrong usage.
    """
    if len(args) != 1:
        print("usage: python bibfilter_oaipos2inspire.py input_filename")
        raise Exception("Wrong usage!!")
    input_filename = args[0]

    out_folder = create_work_folder(CFG_POS_OUT_DIRECTORY)

    insert_records = []
    append_records = []
    error_records = []
    files_uploaded = []

    pos = PosPackage()
    xml_doc = parse(input_filename)
    for record in xml_doc.getElementsByTagName('record'):
        rec = pos.get_record(record)
        identifier = pos.get_identifier()
        # Identifier looks like "oai:pos.sissa.it:CONF/CONTRIB".
        conference = identifier.split(':')[2]
        conference = conference.split('/')[0]
        contribution = identifier.split(':')[2]
        contribution = contribution.split('/')[1]
        identifier = "PoS(%s)%s" % (conference, contribution)
        query = "773__p:pos 773__v:%s 773__c:%s" % \
                (conference.replace(' ', ''), contribution)
        print("Querying with: %s" % (query,))
        results = perform_request_search(p=query, of="id")

        # harvest fulltext
        url = base_url + identifier
        session = requests.session()
        r = session.get(url)
        parsed_html = BeautifulSoup(r.text)
        links = parsed_html.body.findAll('a')
        found = False
        for link in links:
            url = urllib.quote(link['href'], safe=":/")
            if url.endswith('.pdf'):
                found = True
                if results:
                    # Record already exists: build a minimal append
                    # record instead of re-using the harvested one.
                    rec = create_record()
                filename = join(out_folder, identifier + ".pdf")
                record_add_field(rec, '856', ind1='4',
                                 subfields=[('u', url),
                                            ('y', 'PoS server')])
                record_add_field(rec, 'FFT',
                                 subfields=[('a', filename),
                                            ('t', 'PoS'),
                                            ('d', 'Fulltext')])
                try:
                    print('Downloading ' + url)
                    download_url(url, "pdf", filename, 5, 60.0)
                    if results:
                        recid = results[0]
                        record_add_field(rec, '001',
                                         controlfield_value=recid)
                        append_records.append(rec)
                    else:
                        insert_records.append(rec)
                except InvenioFileDownloadError:
                    print("Download of %s failed" % (url,))
                break
        if not found:
            error_records.append(rec)

        # upload to FTP
        tempfile_path = '/tmp/%s.xml' % (contribution,)
        with open(tempfile_path, 'w') as tempfile:
            tempfile.write(record_xml_output(rec))
        try:
            submit_records_via_ftp(tempfile_path, conference)
            files_uploaded.append('%s/%s.xml' % (conference, contribution))
            write_message("%s successfully uploaded to FTP server" %
                          tempfile_path)
        except Exception:
            write_message("Failed to upload %s to FTP server" %
                          tempfile_path)
        remove(tempfile_path)

    insert_filename = "%s.insert.xml" % (input_filename,)
    append_filename = "%s.append.xml" % (input_filename,)
    errors_filename = "%s.errors.xml" % (input_filename,)

    created_files = []

    if write_record_to_file(insert_filename, insert_records):
        copy(insert_filename, out_folder)
        created_files.append(join(out_folder, basename(insert_filename)))
    if write_record_to_file(append_filename, append_records):
        copy(append_filename, out_folder)
        created_files.append(join(out_folder, basename(append_filename)))
    if write_record_to_file(errors_filename, error_records):
        # Bug fix: the errors file was copied onto itself
        # (copy(errors_filename, errors_filename)) instead of into the
        # output folder the summary points at.
        copy(errors_filename, out_folder)
        created_files.append(join(out_folder, basename(errors_filename)))

    total_records = len(append_records) + len(insert_records) + \
        len(error_records)
    subject = "PoS Harvest results: " + \
        datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    body = """
    Total of %d records processed:

    %d new records,
    %d records already existing in the system,
    %d records that failed to retrieve the fulltext

    Location of new records:
    %s
    """ % \
        (total_records,
         len(insert_records),
         len(append_records),
         len(error_records),
         "\n".join(created_files))
    if files_uploaded:
        body += "\nFiles uploaded:"
        for fl in files_uploaded:
            body += "\n\t%s file uploaded on the FTP Server\n" % (fl,)
    write_message(subject)
    write_message(body)
    if not send_email(CFG_SITE_SUPPORT_EMAIL,
                      CFG_POSHARVEST_EMAIL,
                      subject,
                      body):
        print("ERROR: Mail not sent")
    else:
        print("Mail sent to %s" % (CFG_POSHARVEST_EMAIL,))
def main(args):
    """Harvest PoS records, fetch their fulltext PDFs, upload per-record
    XML to the FTP server and write insert/append/error batch files.

    :param args: command-line arguments; expects exactly one element,
        the path of the input OAI-PMH XML file.
    :raises Exception: on wrong usage.
    """
    if len(args) != 1:
        print("usage: python bibfilter_oaipos2inspire.py input_filename")
        raise Exception("Wrong usage!!")
    input_filename = args[0]

    out_folder = create_work_folder(CFG_POS_OUT_DIRECTORY)

    insert_records = []
    append_records = []
    error_records = []
    files_uploaded = []

    pos = PosPackage()
    xml_doc = parse(input_filename)
    for record in xml_doc.getElementsByTagName('record'):
        rec = pos.get_record(record)
        identifier = pos.get_identifier()
        # Identifier looks like "oai:pos.sissa.it:CONF/CONTRIB".
        conference = identifier.split(':')[2]
        conference = conference.split('/')[0]
        contribution = identifier.split(':')[2]
        contribution = contribution.split('/')[1]
        identifier = "PoS(%s)%s" % (conference, contribution)
        query = "773__p:pos 773__v:%s 773__c:%s" % \
                (conference.replace(' ', ''), contribution)
        print("Querying with: %s" % (query,))
        results = perform_request_search(p=query, of="id")

        # harvest fulltext
        url = base_url + identifier
        session = requests.session()
        r = session.get(url)
        parsed_html = BeautifulSoup(r.text)
        links = parsed_html.body.findAll('a')
        found = False
        for link in links:
            url = urllib.quote(link['href'], safe=":/")
            if url.endswith('.pdf'):
                found = True
                if results:
                    # Record already exists: build a minimal append
                    # record instead of re-using the harvested one.
                    rec = create_record()
                filename = join(out_folder, identifier + ".pdf")
                record_add_field(rec, '856', ind1='4',
                                 subfields=[
                                     ('u', url),
                                     ('y', 'PoS server')
                                 ])
                record_add_field(rec, 'FFT',
                                 subfields=[('a', filename),
                                            ('t', 'PoS'),
                                            ('d', 'Fulltext')])
                try:
                    print('Downloading ' + url)
                    download_url(url, "pdf", filename, 5, 60.0)
                    if results:
                        recid = results[0]
                        record_add_field(rec, '001',
                                         controlfield_value=recid)
                        append_records.append(rec)
                    else:
                        insert_records.append(rec)
                except InvenioFileDownloadError:
                    print("Download of %s failed" % (url,))
                break
        if not found:
            error_records.append(rec)

        # upload to FTP
        tempfile_path = '/tmp/%s.xml' % (contribution,)
        with open(tempfile_path, 'w') as tempfile:
            tempfile.write(record_xml_output(rec))
        try:
            submit_records_via_ftp(tempfile_path, conference)
            files_uploaded.append('%s/%s.xml' % (conference, contribution))
            write_message("%s successfully uploaded to FTP server" %
                          tempfile_path)
        except Exception:
            write_message("Failed to upload %s to FTP server" %
                          tempfile_path)
        remove(tempfile_path)

    insert_filename = "%s.insert.xml" % (input_filename,)
    append_filename = "%s.append.xml" % (input_filename,)
    errors_filename = "%s.errors.xml" % (input_filename,)

    created_files = []

    if write_record_to_file(insert_filename, insert_records):
        copy(insert_filename, out_folder)
        created_files.append(join(out_folder, basename(insert_filename)))
    if write_record_to_file(append_filename, append_records):
        copy(append_filename, out_folder)
        created_files.append(join(out_folder, basename(append_filename)))
    if write_record_to_file(errors_filename, error_records):
        # Bug fix: the errors file was copied onto itself
        # (copy(errors_filename, errors_filename)) instead of into the
        # output folder the summary points at.
        copy(errors_filename, out_folder)
        created_files.append(join(out_folder, basename(errors_filename)))

    total_records = len(append_records) + len(insert_records) + \
        len(error_records)
    subject = "PoS Harvest results: " + \
        datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    body = """
    Total of %d records processed:

    %d new records,
    %d records already existing in the system,
    %d records that failed to retrieve the fulltext

    Location of new records:
    %s
    """ % \
        (total_records,
         len(insert_records),
         len(append_records),
         len(error_records),
         "\n".join(created_files))
    if files_uploaded:
        body += "\nFiles uploaded:"
        for fl in files_uploaded:
            body += "\n\t%s file uploaded on the FTP Server\n" % (fl,)
    write_message(subject)
    write_message(body)
    if not send_email(CFG_SITE_SUPPORT_EMAIL,
                      CFG_POSHARVEST_EMAIL,
                      subject,
                      body):
        print("ERROR: Mail not sent")
    else:
        print("Mail sent to %s" % (CFG_POSHARVEST_EMAIL,))
def create_collection(batch_size, new_files, new_sources, directory, submit):
    """Create batched "collection" XML files that contain all the records.

    Groups the records from ``new_files`` into files of at most
    ``batch_size`` records under ``directory``.  When ``submit`` is true
    the files are uploaded via FTP and a report is mailed.
    """
    subject = "Consyn harvest results: %s" % \
        (datetime.now().strftime("%Y-%m-%d %H:%M:%S"),)
    batch = 1
    counter = 1
    date = datetime.now().strftime("%Y.%m.%d")
    files_to_upload = []
    collection = None
    # Number of records in the final, partially-filled batch (None when
    # every batch was exactly full).
    last_batch_count = None
    for filename in new_files:
        if counter == 1:
            filepath = "elsevier-%s-%s.xml" % (date, batch)
            filepath = join(directory, filepath)
            filepath = filepath.lstrip()
            collection = open(filepath, 'w')
            collection.write("<collection>\n")
        with open(filename) as f:
            collection.write(f.read() + '\n')
        counter += 1
        if counter == batch_size:
            collection.write("</collection>")
            collection.close()
            # Drop the reference so the flushed batch cannot be closed
            # twice after the loop (bug in the original when the last
            # batch was exactly full: it wrote to the closed file and
            # appended a duplicate path).
            collection = None
            files_to_upload.append(filepath)
            counter = 1
            batch += 1
    if collection is not None:
        collection.write("</collection>")
        collection.close()
        files_to_upload.append(filepath)
        # counter starts at 1, so the batch holds counter - 1 records
        # (the original reported "counter", off by one).
        last_batch_count = counter - 1
    body = ['From %s sources, found and converted %s records' %
            (len(new_sources), len(new_files)),
            # Real count instead of the hard-coded
            # "(batch - 1) * 500 + counter" approximation.
            '\t%s records ready to upload:\n' % (len(new_files),)]
    if submit:
        body += ['\tFiles uploaded to Server:']
        for filepath in files_to_upload:
            try:
                submit_records_via_ftp(filepath)
                filename = filepath.split('/')[-1]
                body.append("\t%s (%s records)" % (filename, batch_size))
            except Exception:
                _errors_detected.append(Exception(
                    "Failed to upload %s to FTP server" % filepath))
                write_message("Failed to upload %s to FTP server" %
                              filepath)
    else:
        body += ['\tFiles ready for upload:']
        for filename in files_to_upload:
            body.append("\t%s (%s records)" % (filename, batch_size))
    if len(body) > 3:
        if last_batch_count is not None:
            # update the last line of the message: the final batch
            # holds fewer than batch_size records.
            body[-1] = "\t%s (%s records)" % (filename, last_batch_count)
        body = '\n'.join(body)
        write_message(subject)
        write_message(body)
        # Mail only when something was produced: the original passed
        # the still-unjoined list to submit_records_via_mail when no
        # files existed.
        if submit:
            if submit_records_via_mail(subject, body,
                                       CFG_CONSYNHARVEST_EMAIL):
                write_message("Mail sent to %r" %
                              (CFG_CONSYNHARVEST_EMAIL,))
            else:
                write_message("ERROR: Cannot send mail.")
    else:
        write_message(subject)
        write_message("No new files!")
def create_collection(batch_size, new_files, new_sources,
                      directory, upload_FTP):
    """Create batched "collection" XML files that contain all the records.

    Records from ``new_files`` are concatenated into files of at most
    ``batch_size`` records under ``directory``.  Files are uploaded via
    FTP only when ``upload_FTP`` is true; the summary is always mailed.
    """
    subject = "Consyn harvest results: %s" % \
        (datetime.now().strftime("%Y-%m-%d %H:%M:%S"),)
    if new_files:
        batch = 1
        counter = 0
        date = datetime.now().strftime("%Y.%m.%d")
        filepath = "elsevier-%s-%s.xml" % (date, batch)
        filepath = join(directory, filepath).lstrip()
        files_to_upload = []
        # Open batches explicitly: rebinding the name inside a
        # with-block (as the original did) only guaranteed closing the
        # first batch file.
        collection = open(filepath, 'w')
        collection.write("<collection>\n")
        try:
            for f in new_files:
                if counter == batch_size:
                    counter = 0
                    batch += 1
                    collection.write("</collection>")
                    collection.close()
                    files_to_upload.append(filepath)
                    filepath = "elsevier-%s-%s.xml" % (date, batch)
                    filepath = join(directory, filepath).lstrip()
                    collection = open(filepath, 'w')
                    collection.write("<collection>\n")
                with open(f, 'r') as xml_file:
                    collection.write(xml_file.read() + '\n')
                counter += 1
            collection.write("</collection>")
            files_to_upload.append(filepath)
        finally:
            collection.close()
        body = [
            'From %s sources, found and converted %s records' %
            (len(new_sources), len(new_files)),
            # Real count instead of the hard-coded
            # "(batch - 1) * 500 + counter" approximation.
            '\t%s records ready to upload:\n' % (len(new_files),)
        ]
        if upload_FTP:
            # Bug fix: the original uploaded unconditionally — the
            # flag only changed the message header.
            body += ['\tFiles uploaded to Server:']
            for filepath in files_to_upload:
                try:
                    submit_records_via_ftp(filepath)
                    filename = filepath.split('/')[-1]
                    body.append("\t%s (%s records)" %
                                (filename, batch_size))
                except Exception:
                    write_message("Failed to upload %s to FTP server" %
                                  filepath)
        else:
            body += ['\tFiles ready for upload:']
            for filepath in files_to_upload:
                filename = filepath.split('/')[-1]
                body.append("\t%s (%s records)" % (filename, batch_size))
        if len(body) > 3:
            # update the last line of the message: the final batch
            # usually holds fewer than batch_size records.
            body[-1] = "\t%s (%s records)" % (filename, counter)
        body = '\n'.join(body)
        write_message(subject)
        write_message(body)
        if submit_records_via_mail(subject, body,
                                   CFG_CONSYNHARVEST_EMAIL):
            write_message("Mail sent to %r" %
                          (CFG_CONSYNHARVEST_EMAIL,))
        else:
            write_message("ERROR: Cannot send mail.")
    else:
        write_message(subject)
        write_message("No new files")