Example No. 1
    def _get_metadata_and_fulltex_dir(self):
        # Announce the step on stdout.
        print >> sys.stdout, "\nRetrieving journal items directories."
        # Create a progress bar sized to the number of packages.
        p_bar = progress_bar(len(self.files_list))
        # Draw the initial, empty bar.
        sys.stdout.write(p_bar.next())
        sys.stdout.flush()

        for name in self.files_list:
            dataset_link = join(self.path_unpacked, name.split('.')[0], 'dataset.xml')

            try:
                dataset_xml = parse(dataset_link)
            except Exception:
                register_exception(alert_admin=True, prefix="Elsevier error reading dataset.xml file.")
                self.logger.error("Error reading dataset.xml file: %s" % (dataset_link,))
                print >> sys.stdout, "\nError reading dataset.xml file: %s" % (dataset_link,)
                continue

            # created = get_value_in_tag(dataset_xml.getElementsByTagName('dataset-unique-ids')[0], 'timestamp')
            journal_items = dataset_xml.getElementsByTagName('journal-item')
            self.logger.info("Getting metadata and fulltex directories for %i journal items." % (len(journal_items),))
            for journal_item in journal_items:
                xml_pathname = join(self.path_unpacked, name.split('.')[0],
                                    xml_to_text(journal_item.getElementsByTagName('ml')[0]
                                                .getElementsByTagName('pathname')[0]))
                pdf_pathname = join(self.path_unpacked, name.split('.')[0],
                                    xml_to_text(journal_item.getElementsByTagName('web-pdf')[0]
                                                .getElementsByTagName('pathname')[0]))
                self.found_articles.append(dict(xml=xml_pathname, pdf=pdf_pathname))
            self.logger.info("Got metadata and fulltex directories of %i journals." % (len(self.found_articles),))
            # Print stuff
            sys.stdout.write(p_bar.next())
            sys.stdout.flush()
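These snippets all lean on a progress_bar helper that is built with the number of items to process and whose .next() call returns the redrawn bar as a string. That helper is not shown here; a minimal sketch of what it could look like, assuming it is a plain Python 2 generator, is:

def progress_bar(total, width=40):
    # Sketch only: yield one frame for the initial draw plus one frame per
    # processed item, matching how the snippets call p_bar.next().
    for done in range(total + 1):
        filled = width * done // max(total, 1)
        yield "\r[%s%s] %i/%i" % ("=" * filled,
                                  " " * (width - filled),
                                  done, total)

With total items the snippets call .next() exactly total + 1 times (one initial draw plus one per iteration), which is why the sketch yields total + 1 frames.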
Example No. 2
    def _download_file_listing(self):
        if self.files_list:
            # Announce the step on stdout.
            print >> sys.stdout, "\nDownloading %i \".ready\" files." % (len(self.files_list),)
            # Create a progress bar sized to the number of files.
            p_bar = progress_bar(len(self.files_list))
            # Draw the initial, empty bar.
            sys.stdout.write(p_bar.next())
            sys.stdout.flush()

            for filename in self.files_list:
                self.logger.info("Downloading: %s" % (filename,))
                pkg_path = join(CFG_READY_PACKAGES, filename)
                self.path_r_pkg.append(pkg_path)
                try:
                    ready_file = open(pkg_path, 'wb')
                    self.ftp.retrbinary('RETR %s' % (filename,), ready_file.write)
                    ready_file.close()
                except Exception:
                    self.logger.error("Error downloading file: %s" % (filename,))
                    print >> sys.stdout, "\nError downloading %s file!" % (filename,)
                    print >> sys.stdout, sys.exc_info()
                # Advance the progress bar.
                sys.stdout.write(p_bar.next())
                sys.stdout.flush()
            return self.path_r_pkg
        else:
            print >> sys.stdout, "No new packages to download."
            self.logger.info("No new packages to download.")
            raise NoNewFiles
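The self.ftp object these methods call retrbinary on looks like a standard ftplib.FTP connection opened elsewhere in the harvester. A rough sketch of that setup, with placeholder host and credentials, is:

from ftplib import FTP

# Placeholder host and credentials; the real values would come from the
# harvester's configuration, which is not part of these snippets.
ftp = FTP('ftp.example.org')
ftp.login('anonymous', 'guest@example.org')
files_list = ftp.nlst()    # names later passed to 'RETR %s' downloads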
Example No. 3
    def _download_tars(self, check_integrity=True):
        if check_integrity:
            check_pkgs_integrity(self.retrieved_packages, self.logger, self.ftp)

        print >> sys.stdout, "\nDownloading %i tar packages." \
                 % (len(self.retrieved_packages))
        # Create progrss bar
        p_bar = progress_bar(len(self.files_list))
        # Print stuff
        sys.stdout.write(p_bar.next())
        sys.stdout.flush()

        for filename in self.retrieved_packages.iterkeys():
            self.logger.info("Downloading tar package: %s" % (filename,))
            unpack_path = join(CFG_TAR_FILES, filename)
            self.retrieved_packages_unpacked.append(unpack_path)
            try:
                tar_file = open(unpack_path, 'wb')
                self.ftp.retrbinary('RETR %s' % filename, tar_file.write)
                tar_file.close()
            except Exception:
                register_exception(alert_admin=True, prefix="Elsevier package download failed.")
                self.logger.error("Error downloading tar file: %s" % (filename,))
                print >> sys.stdout, "\nError downloading %s file!" % (filename,)
                print >> sys.stdout, sys.exc_info()
            # Advance the progress bar.
            sys.stdout.write(p_bar.next())
            sys.stdout.flush()

        return self.retrieved_packages_unpacked
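One detail worth noting in these download loops: if retrbinary raises, the local file handle is never closed, because close() sits after the call inside the try block. A small sketch of the same per-file step written with a context manager (not how the original code does it) is:

def fetch_binary(ftp, remote_name, local_path):
    # Sketch of the per-file download step; the with-statement closes the
    # local file even when retrbinary fails halfway through.
    with open(local_path, 'wb') as local_file:
        ftp.retrbinary('RETR %s' % (remote_name,), local_file.write)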
Example No. 4
    def _get_packages(self):
        # Announce the step on stdout.
        print >> sys.stdout, "\nRetrieving package names."
        # Create a progress bar sized to the number of listing files.
        p_bar = progress_bar(len(self.files_list))
        # Draw the initial, empty bar.
        sys.stdout.write(p_bar.next())
        sys.stdout.flush()

        for pack in self.path_r_pkg:
            self.logger.info("Retrieved package name: %s" % (pack,))
            pack_xml = parse(pack)
            package_file = pack_xml.getElementsByTagName('dataset-package-file')
            for pf in package_file:
                filename = pf.getElementsByTagName('filename')[0]
                md5_val = pf.getElementsByTagName('md5')[0]
                self.retrieved_packages[xml_to_text(filename)] = xml_to_text(md5_val)
            # Advance the progress bar.
            sys.stdout.write(p_bar.next())
            sys.stdout.flush()

        return self.retrieved_packages
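The filename-to-MD5 mapping returned here is presumably what check_pkgs_integrity verifies in the other snippets; that function is not shown. A minimal sketch of such a check over already-downloaded files, purely as an illustration, might look like:

import hashlib
from os.path import join

def verify_md5(packages, directory):
    # Sketch only: `packages` maps a tar filename to the hex digest read
    # from dataset.xml; returns the filenames whose local copy mismatches.
    mismatched = []
    for filename, expected in packages.items():
        digest = hashlib.md5()
        with open(join(directory, filename), 'rb') as pkg:
            for chunk in iter(lambda: pkg.read(8192), b''):
                digest.update(chunk)
        if digest.hexdigest() != expected:
            mismatched.append(filename)
    return mismatched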
Example No. 5
    def _download_tars(self, check_integrity=True):
        self.retrieved_packages_unpacked = []
        if self.files_list:
            if check_integrity:
                check_pkgs_integrity(self.files_list, self.logger, self.ftp)

            # Announce the step on stdout.
            print >> sys.stdout, "\nDownloading %i tar packages." \
                                 % (len(self.files_list),)
            # Create a progress bar sized to the number of packages.
            p_bar = progress_bar(len(self.files_list))
            # Draw the initial, empty bar.
            sys.stdout.write(p_bar.next())
            sys.stdout.flush()

            for filename in self.files_list:
                self.logger.info("Downloading tar package: %s" % (filename,))
                unpack_path = join(CFG_TAR_FILES, filename)
                self.retrieved_packages_unpacked.append(unpack_path)
                try:
                    tar_file = open(unpack_path, 'wb')
                    self.ftp.retrbinary('RETR %s' % filename, tar_file.write)
                    tar_file.close()
                except Exception:
                    self.logger.error("Error downloading tar file: %s" % (filename,))
                    print >> sys.stdout, "\nError downloading %s file!" % (filename,)
                    print >> sys.stdout, sys.exc_info()
                # Advance the progress bar.
                sys.stdout.write(p_bar.next())
                sys.stdout.flush()

            return self.retrieved_packages_unpacked
        else:
            print >> sys.stdout, "No new packages to download."
            self.logger.info("No new packages to download.")
            raise NoNewFiles
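Pieced together, the snippets suggest a calling order roughly like the one below; the harvester object is hypothetical, and the unpacking step that fills self.path_unpacked sits between downloading the tars and reading dataset.xml, outside these examples:

try:
    harvester._download_file_listing()    # fetch the ".ready" listing files
    harvester._get_packages()             # build the filename -> md5 map
    harvester._download_tars()            # fetch the tar packages themselves
    # ... unpacking happens here, then:
    harvester._get_metadata_and_fulltex_dir()
except NoNewFiles:
    pass    # nothing new on the FTP server during this run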