def _move_to_done(obj, eng):
    """Move every file listed in ``obj.data['collections']`` to the target folder.

    The destination directory is resolved with ``get_storage_path`` using
    the ``target_folder`` suffix taken from the enclosing scope. Each file
    keeps its base name. One info message is logged per moved file.

    :param obj: workflow object; ``obj.data`` is read and ``obj.log`` is
        used for logging.
    :param eng: workflow engine (unused here).
    """
    destination_dir = get_storage_path(suffix=target_folder)
    # A missing 'collections' entry simply means there is nothing to move.
    for source_path in obj.data.get('collections', {}).values():
        destination_path = join(destination_dir, basename(source_path))
        move(source_path, destination_path)
        obj.log.info(
            "Moved {0} to {1}".format(source_path, destination_dir)
        )
def _get_files_from_ftp(obj, eng):
    """Download files over FTP and record them on the workflow object.

    ``ftp_download_files`` is called with ``source_folder``, the resolved
    storage path, ``server`` and the netrc file returned by ``get_netrc()``
    (``source_folder``, ``target_folder`` and ``server`` come from the
    enclosing scope). Its two-element result is stored as
    ``obj.data['all_files']`` and ``obj.data['new_files']``, in that order.

    :param obj: workflow object; ``obj.data`` is written and ``obj.log``
        is used for logging.
    :param eng: workflow engine (unused here).
    """
    download_dir = get_storage_path(suffix=target_folder)
    all_files, new_files = ftp_download_files(
        source_folder,
        download_dir,
        server=server,
        netrc_file=get_netrc()
    )
    obj.data['all_files'] = all_files
    obj.data['new_files'] = new_files

    # Counts are read back from obj.data, new files first.
    new_count = len(obj.data["new_files"])
    total_count = len(obj.data["all_files"])
    obj.log.info(
        "{0} new files downloaded, in total {1} files".format(
            new_count, total_count
        )
    )
def _get_files_from_ftp(obj, eng):
    """Download files over FTP using connection settings from the config.

    The netrc file (optional, ``ftp_netrc_file``) and the server
    (required, ``ftp_server``) are read from ``obj.extra_data["config"]``.
    The two-element result of ``ftp_download_files`` is stored as
    ``obj.data['all_files']`` and ``obj.data['new_files']``, in that order.

    :param obj: workflow object; ``obj.extra_data`` is read, ``obj.data``
        is written and ``obj.log`` is used for logging.
    :param eng: workflow engine (unused here).
    """
    # .get() tolerates a missing netrc entry; the server key is mandatory.
    netrc_path = obj.extra_data["config"].get("ftp_netrc_file")
    download_dir = get_storage_path(suffix=target_folder)

    all_files, new_files = ftp_download_files(
        source_folder,
        download_dir,
        server=obj.extra_data["config"]["ftp_server"],
        netrc_file=netrc_path
    )
    obj.data['all_files'] = all_files
    obj.data['new_files'] = new_files

    # Counts are read back from obj.data, new files first.
    new_count = len(obj.data["new_files"])
    total_count = len(obj.data["all_files"])
    obj.log.info(
        "{0} new files downloaded, in total {1} files".format(
            new_count, total_count
        )
    )
def _convert_files(obj, eng):
    """Convert extracted files dated within the harvest window and save them.

    The window is ``[from_date, to_date]``. ``to_date`` defaults to today.
    ``from_date`` falls back to the value cached under ``date_key`` and
    then to seven days ago. The resolved dates are written back to
    ``obj.extra_data['args']``.

    For each path in ``obj.data['extracted_files']`` whose date (as given
    by ``WorldScientific.get_date``) lies in the window, the record from
    ``get_record`` is written under the storage path, if it is non-empty.
    The written paths end up in ``obj.data['insert']`` and the folder in
    ``obj.data['result_path']``.

    :param obj: workflow object; ``obj.extra_data`` and ``obj.data`` are
        read and written, ``obj.log`` is used for logging.
    :param eng: workflow engine (unused here).
    """
    from invenio_knowledge.api import get_kb_mappings

    journal_mappings = {
        entry['key']: entry['value']
        for entry in get_kb_mappings('JOURNALS')
    }
    converter = WorldScientific(journal_mappings)
    output_dir = get_storage_path(suffix=target_folder)

    args = obj.extra_data['args']

    # By default, the end of the window is today.
    to_date = args.get("to_date")
    if not to_date:
        to_date = datetime.now().strftime('%Y-%m-%d')

    # Start of the window: explicit argument, then the cached date and,
    # as a last resort, one week ago.
    from_date = args.get("from_date")
    if not from_date:
        from_date = cache.get(date_key)
    if not from_date:
        one_week_ago = datetime.now() - timedelta(days=7)
        from_date = one_week_ago.strftime('%Y-%m-%d')

    obj.extra_data['args']["to_date"] = to_date
    obj.extra_data['args']["from_date"] = from_date

    insert_files = []
    for source_path in obj.data['extracted_files']:
        article_date = converter.get_date(source_path)
        if not (from_date <= article_date <= to_date):
            continue
        marc = converter.get_record(source_path)
        if not marc:
            continue
        output_path = join(output_dir, basename(source_path))
        insert_files.append(output_path)
        with open(output_path, 'w') as outfile:
            outfile.write(marc)

    obj.log.info(
        "Converted {0} articles between {1} to {2}".format(
            len(insert_files), from_date, to_date
        )
    )

    obj.data['insert'] = insert_files
    obj.data["result_path"] = output_dir

    obj.log.debug("Saved converted files to {0}".format(output_dir))
    obj.log.debug("{0} files to add".format(len(obj.data["insert"])))
def _unzip_files(obj, eng):
    """Unzip every archive in ``obj.data['all_files']`` into the target folder.

    Whatever ``unzip`` returns for each archive is accumulated into
    ``obj.data['extracted_files']``. An archive that raises ``BadZipfile``
    is logged as an error and skipped, so one corrupt archive does not
    abort the others.

    :param obj: workflow object; ``obj.data`` is read and written and
        ``obj.log`` is used for logging.
    :param eng: workflow engine (unused here).
    """
    destination_dir = get_storage_path(suffix=target_folder)
    extracted_files = []

    for archive_path in obj.data.get('all_files', []):
        try:
            extracted_files += unzip(archive_path, destination_dir)
        except BadZipfile as error:
            # Best effort: report the broken archive and keep going.
            obj.log.error(
                "Error unzipping file {0}: {1}".format(archive_path, error)
            )

    obj.data['extracted_files'] = extracted_files
    obj.log.debug(
        "{0} new files extracted".format(len(obj.data["extracted_files"]))
    )
def _convert_files(obj, eng):
    """Convert extracted files dated within the harvest window and save them.

    The window is ``[from_date, to_date]``, both ``'%Y-%m-%d'`` strings.
    ``to_date`` defaults to today. When ``from_date`` is not given it is
    ``"1900-01-01"`` for a reharvest (``args['reharvest']`` truthy) and
    ``weeks_threshold`` weeks ago otherwise. The resolved dates are written
    back to ``obj.extra_data['args']``.

    A reharvest processes ``obj.data['all_extracted_files']``; a normal run
    processes ``obj.data['newly_extracted_files']``. A file is converted
    when ``WorldScientific.get_date`` returns ``None`` or a date inside the
    window; other files are logged as filtered out. Non-empty records are
    written under the storage path. The written paths end up in
    ``obj.data['insert']`` and the folder in ``obj.data['result_path']``.

    :param obj: workflow object; ``obj.extra_data`` and ``obj.data`` are
        read and written, ``obj.log`` is used for logging.
    :param eng: workflow engine (unused here).
    """
    from invenio_knowledge.api import get_kb_mappings

    date_format = '%Y-%m-%d'

    mappings = {
        item['key']: item['value']
        for item in get_kb_mappings('JOURNALS')
    }
    ws = WorldScientific(mappings)

    target_folder_full = get_storage_path(suffix=target_folder)

    args = obj.extra_data['args']
    reharvest = args.get("reharvest")

    # By default, the end of the window is today.
    to_date = args.get("to_date") or datetime.now().strftime(date_format)

    from_date = args.get("from_date")
    if not from_date:
        if reharvest:
            # Since the "beginning" of time when not specified. Kept as a
            # string so it compares against the other date strings; a
            # datetime object here cannot be ordered against str values.
            from_date = "1900-01-01"
        else:
            # Dynamic date in the past when not specified and not
            # reharvesting. The subtraction must be parenthesised:
            # strftime exists on datetime, not on timedelta.
            from_date = (
                datetime.now() - timedelta(weeks=weeks_threshold)
            ).strftime(date_format)

    obj.extra_data['args']["to_date"] = to_date
    obj.extra_data['args']["from_date"] = from_date

    if reharvest:
        filenames = obj.data['all_extracted_files']
    else:
        filenames = obj.data['newly_extracted_files']

    insert_files = []
    for filename in filenames:
        # NOTE(review): assumes get_date returns None or a '%Y-%m-%d'
        # string, so plain string comparison orders dates correctly.
        date = ws.get_date(filename)
        if date is None or (from_date <= date <= to_date):
            marc = ws.get_record(filename)
            if marc:
                output_path = join(target_folder_full, basename(filename))
                insert_files.append(output_path)
                with open(output_path, 'w') as outfile:
                    outfile.write(marc)
        else:
            obj.log.info("Filtered out {0} ({1})".format(filename, date))

    obj.log.info("Converted {0}/{1} articles between {2} to {3}".format(
        len(insert_files),
        len(filenames),
        from_date,
        to_date
    ))

    obj.data['insert'] = insert_files
    obj.data["result_path"] = target_folder_full

    obj.log.debug("Saved converted files to {0}".format(target_folder_full))
    obj.log.debug("{0} files to add".format(len(obj.data["insert"])))