示例#1
0
def download_files(from_date, to_date):
    """Downloads the new files from the EDP Sciences
    FTP server."""
    download_folder = join(CFG_EDPSCIENCE_OUT_FOLDER, 'packages')
    old_files = listdir(download_folder)
    ftp = FtpHandler(CFG_EDPSCIENCE_SERVER,
                     CFG_EDPSCIENCE_USERNAME,
                     CFG_EDPSCIENCE_PASSWORD)
    ftp.cd('incoming')
    new_files = ftp.ls()[0]
    new_files = filter(lambda a: is_younger(a,
                                            from_date,
                                            ftp),
                       new_files)
    files_to_download = filter(lambda a: a not in old_files,
                               new_files)
    counter = 1
    for filename in files_to_download:
        task_update_progress('Downloading files 1/3 \t%s of %s'
                             % (counter, len(new_files)))
        write_message('Downloading file %s' % (filename,))
        ftp.download(filename, download_folder)
        filename = join(download_folder, filename)
        counter += 1
    ftp.close()
    return map(lambda a: join(download_folder, a), new_files)
示例#2
0
def convert_files(files_to_convert, source_folder, marc_folder,
                  from_date, to_date):
    """Converts the xml source files to marc xml files"""
    converted_files = []
    journal_mappings = get_kbs()['journals'][1]
    edp = EDPSciencesPackage(journal_mappings)
    counter = 1
    for filename in files_to_convert:
        task_update_progress('Converting files 3/3 \t%s of %s' %
                             (counter, len(files_to_convert)))
        target_file = filename.split('/')[-1]
        target_file = join(marc_folder, target_file)
        target_folder = dirname(target_file)
        if not exists(target_folder):
            makedirs(target_folder)
        record = ""
        datestamp = edp.get_date(filename)
        if exists(target_file):
            if from_date and to_date:
                if datestamp >= from_date and\
                        datestamp <= to_date:
                    converted_files.append(target_file)
                elif from_date:
                    if datestamp >= from_date:
                        converted_files.append(target_file)
                elif to_date:
                    if datestamp <= to_date:
                        converted_files.append(target_file)
                else:
                    converted_files.append(target_file)
        else:
            if 'xml_rich' in filename:
                record = edp.get_record_rich(filename, refextract)
            else:
                if from_date and to_date:
                    if datestamp >= from_date and\
                            datestamp <= to_date:
                        record = edp.get_record(filename, refextract)
                elif from_date:
                    if datestamp >= from_date:
                        record = edp.get_record(filename, refextract)
                elif to_date:
                    if datestamp <= to_date:
                        record = edp.get_record(filename, refextract)
                else:
                    record = edp.get_record(filename, refextract)
            if record:
                write_message("Converted file: %s" % (filename,))
                with open(target_file, 'w') as out:
                    out.write(record)
                    converted_files.append(target_file)
        counter += 1
    return converted_files
示例#3
0
def create_collection(converted_files, new_files):
    """Creates the record collection file
    uploads it to the FTP server and sends
    an email to inform about the harvest"""
    target_file = "edpsciences.%s.xml" % \
                  (datetime.now().strftime("%Y-%m-%d"),)
    target_file = join(CFG_EDPSCIENCE_OUT_FOLDER, target_file)
    write_message("Creating collection file: %s" % (target_file,))
    with open(target_file, 'w') as collection:
        collection.write('<collection>\n')
        for fl in converted_files:
            recordfile = open(fl)
            collection.write(recordfile.read())
            recordfile.close()
        collection.write('\n</collection>')
    submit_records_via_ftp(target_file)
    body = ['From %s sources, found and converted %s records'
            % (len(new_files), len(converted_files)),
            '\t%s records ready to upload:\n'
            % (len(converted_files),),
            '\t%s uploaded to server:'
            % (target_file,)]
    body = '\n'.join(body)
    subject = "EDP Sciences harvest results: %s" % \
              (datetime.now().strftime("%Y-%m-%d %H:%M:%S"),)
    write_message(body)
    if submit_records_via_mail(subject, body, CFG_SITE_SUPPORT_EMAIL):
        write_message("Mail sent to %r" % (CFG_SITE_SUPPORT_EMAIL,))
    else:
        write_message("ERROR: Cannot send mail.")
示例#4
0
def extract_files(new_files, source_folder):
    """Extracts the tar files to the source_folder"""
    counter = 1
    extracted_files = []
    for filename in new_files:
        task_update_progress('Extracting files 2/3 \t%s of %s' %
                             (counter, len(new_files)))
        write_message('Extracting file %s' % (filename,))
        tar = tarfile.open(filename)
        files = [a.name for a in tar.getmembers()]
        tar.extractall(source_folder)
        extracted_files.extend(files)
        tar.close()
        counter += 1
    return [join(source_folder, a) for a in extracted_files]
示例#5
0
def bst_edpsciences_harvest(from_date="", to_date=""):
    """ Task to download and convert xml files from EDP Sciences
    FTP servers to Marc xml files.
    """
    source_folder = join(CFG_EDPSCIENCE_OUT_FOLDER, 'sources')
    if not exists(source_folder):
        makedirs(source_folder)
    marc_folder = join(CFG_EDPSCIENCE_OUT_FOLDER, 'marc')
    if not exists(marc_folder):
        makedirs(marc_folder)
    new_files = download_files(from_date, to_date)
    files_to_convert = []
    if new_files:
        task_sleep_now_if_required()
        files_to_convert = extract_files(new_files, source_folder)
        task_sleep_now_if_required(can_stop_too=True)
        converted_files = convert_files(files_to_convert, source_folder,
                                        marc_folder, from_date, to_date)
        if converted_files:
            create_collection(converted_files, new_files)
    else:
        write_message("No new files to download!")