Example No. 1
    def __init__(self, **options):
        super(GlassInfo, self).__init__(**options)
        self.batch_cat = "{}: {}".format(BATCH_CAT, BATCH_DATE)
        self.commons = pywikibot.Site('commons', 'commons')
        self.wikidata = pywikibot.Site('wikidata', 'wikidata')
        self.log = common.LogFile('', LOGFILE)
        self.category_cache = []
Example No. 2
    def __init__(self, options):
        """Initialise a harvester object for a DigitaltMuseum harvest."""
        if not os.path.exists(CACHE_DIR):
            os.makedirs(CACHE_DIR)  # Create directory for cache if needed
        self.data = {}  # data container for harvested info
        self.settings = options
        self.log = common.LogFile('', self.settings.get('harvest_log_file'))
        self.log.write_w_timestamp('Harvester started...')
        self.exhibition_cache = {}  # cache for exhibition dimu-code, as it's
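All of these constructors rely on the project's shared `common.LogFile` helper. A minimal sketch of the usage pattern the examples assume, with a hypothetical log file name; only the methods actually seen in the examples (`write`, `write_w_timestamp`, `close_and_confirm`) are used:

import common  # the project's helper module used throughout these examples

LOGFILE = 'example.log'  # hypothetical file name

log = common.LogFile('', LOGFILE)  # '' keeps the log in the working directory
log.write_w_timestamp('Harvester started...')  # timestamped entry
log.write('a plain entry without a timestamp')
print(log.close_and_confirm())  # close the log and report what was written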
Example No. 3
    def __init__(self, **options):
        """Initialise a make_info object."""
        batch_date = options.get('batch_label') or BATCH_DATE
        batch_cat = options.get('base_meta_cat') or BATCH_CAT
        super(KMBInfo, self).__init__(batch_cat, batch_date, **options)
        self.commons = pywikibot.Site('commons', 'commons')
        self.wikidata = pywikibot.Site('wikidata', 'wikidata')
        self.category_cache = {}  # cache for category_exists()
        self.photographer_cache = {}
        self.log = common.LogFile('', LOGFILE)
Example No. 4
    def __init__(self, **options):
        """Initialise a make_info object."""
        batch_date = common.pop(options, 'batch_label') or BATCH_DATE
        batch_cat = common.pop(options, 'base_meta_cat') or BATCH_CAT
        super(SMVKInfo, self).__init__(batch_cat, batch_date, **options)

        self.commons = pywikibot.Site('commons', 'commons')
        self.wikidata = pywikibot.Site('wikidata', 'wikidata')
        self.category_cache = {}  # cache for category_exists()
        self.wikidata_cache = {}  # cache for Wikidata results
        self.log = common.LogFile('', LOGFILE)
        self.log.write_w_timestamp('Make info started...')
        self.pd_year = datetime.now().year - 70
Example No. 5
    def __init__(self, **options):
        """Initialise a make_info object."""
        self.b_settings = self.load_batch_settings(options)
        super(GLAMInfo, self).__init__(
            self.b_settings["batch_cat"], self.b_settings["batch_date"],
            **options)

        self.commons = pywikibot.Site('commons', 'commons')
        self.wikidata = pywikibot.Site('wikidata', 'wikidata')
        self.category_cache = {}  # cache for category_exists()
        self.wikidata_cache = {}  # cache for Wikidata results
        self.log = common.LogFile(
            '', self.b_settings.get("makeinfo_log_file") or LOGFILE)
        self.log.write_w_timestamp('Make info started...')
        self.pd_year = datetime.now().year - 70
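The GLAMInfo variant above pulls its batch configuration from `load_batch_settings`. A hedged sketch of the dictionary shape the constructor appears to expect; the keys are taken from the code above, the values are purely illustrative assumptions:

b_settings = {
    'batch_cat': 'Media contributed by Example GLAM',  # base maintenance category (assumed value)
    'batch_date': '2023-05',                           # batch label (assumed value)
    'makeinfo_log_file': 'makeinfo.log',               # optional; LOGFILE is used when missing
}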
Example No. 6
def run(start=None, end=None):
    """Get parsed data for whole kmb hitlist and store as json."""
    log = common.LogFile('', LOGFILE)
    hitlist = load_list()
    if start or end:
        hitlist = hitlist[start:end]
    data = {}
    total_count = len(hitlist)
    for count, kmb in enumerate(hitlist):
        data[kmb] = kmb_wrapper(kmb, log)
        time.sleep(THROTTLE)
        if count % 100 == 0:
            pywikibot.output('{time:s} - {count:d} of {total:d} parsed'.format(
                time=time.strftime('%H:%M:%S'), count=count,
                total=total_count))
    output_blob(data)
    pywikibot.output(log.close_and_confirm())
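Because `run` slices the loaded hit list with `hitlist[start:end]`, a harvest can be processed in batches. A minimal usage sketch (the bounds are illustrative):

run()           # parse and store the whole kmb hit list
run(0, 100)     # parse only the first 100 entries
run(start=100)  # resume from entry 100 onwards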
Example No. 7
    def __init__(self, options):
        """Initialise a mapping updater for a SMVK dataset."""
        self.settings = options
        parser = CsvParser(**self.settings)

        self.log = common.LogFile('', self.settings.get('mapping_log_file'))
        self.log.write_w_timestamp('Updater started...')
        self.mappings = load_mappings(
            update_mappings=True,
            mappings_dir=self.settings.get('mappings_dir'))
        data = parser.load_data(self.settings.get('data_file'))
        # load archive card data to ensure formatting is still valid
        archive_data = parser.load_archive_data(
            self.settings.get('archive_file'))

        self.people_to_map = Counter()
        self.ethnic_to_map = Counter()
        self.places_to_map = OrderedDict()
        self.keywords_to_map = Counter()
        self.expedition_to_match = set()
        self.museum_to_match = set()
        self.external_to_parse = set()

        self.parse_data(data)
        self.parse_archive_data(archive_data)

        # validate hard coded mappings
        for ext_id in self.external_to_parse:
            utils.parse_external_id(ext_id)
        for expedition in self.expedition_to_match:
            if expedition not in self.mappings.get('expeditions'):
                pywikibot.warning(
                    '{} must be added to expeditions.json'.format(expedition))
        museum_mapping = self.mappings.get('museums')
        for museum, museum_type in self.museum_to_match:
            if museum not in museum_mapping:
                pywikibot.warning(
                    '{} must be added to museum.json'.format(museum))
            elif museum_type not in museum_mapping.get(museum).get('known_types'):
                pywikibot.warning(
                    'The "{}" type for {} must be added to the Wikimedia link '
                    'templates and to museum.json'.format(museum_type, museum))

        self.dump_to_wikifiles()
Example No. 8
    def __init__(self, options):
        """Initialise an mapping updater for a DigitaltMuseum harvest."""
        self.settings = options

        self.log = common.LogFile('', self.settings.get('mapping_log_file'))
        self.log.write_w_timestamp('Updater started...')
        self.mappings = load_mappings(
            update_mappings=True,
            mappings_dir=self.settings.get('mappings_dir'))
        harvest_data = load_harvest_data(self.settings.get('harvest_file'))

        self.kulturnav_hits = load_kulturnav_data()
        self.people_to_map = {}
        self.places_to_map = OrderedDict()
        self.subjects_to_map = Counter()

        self.parse_harvest_data(harvest_data)
        self.check_and_remove_code_place_entries()
        self.dump_to_wikifiles()
Example No. 9
def get_data():
    """Get parsed data for given keywords and store as json files."""
    log = common.LogFile('', LOGFILE)
    settings = load_settings()
    keywords = settings["keywords"]
    api_key = settings["api_key"]
    for keyword in keywords:
        print("[{}] : fetching data.".format(keyword))
        filename = "results_{0}.json".format(keyword)
        results = {}
        hits_limit = 500
        start_at = 1
        counter = 0
        while True:
            url = create_url(keyword, hits_limit, start_at, api_key)
            records = get_records_from_url(url)
            total_results = get_total_hits(records)
            records = split_records(records)
            records_on_page = len(records)
            if records_on_page == 0:
                break
            else:
                for record in records:
                    counter += 1
                    id_no = extract_id_number(record)
                    processed_dict = {'ID': id_no, 'problem': []}
                    processed_record = parse_record(record, processed_dict,
                                                    log)
                    if id_no not in results:
                        results[id_no] = processed_record
                    if counter % 100 == 0:
                        print("Processed {} out of {}".format(
                            counter, total_results))
                start_at += hits_limit
                time.sleep(THROTTLE)
        print("[{}] : fetched {} records to {}.".format(
            keyword, len(results), filename))
        save_data(results, filename)
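`get_data` expects `load_settings` to provide both the keyword list and the API key. A minimal sketch of a matching settings structure; the keywords and key below are placeholder assumptions:

settings = {
    "keywords": ["runestone", "rock carving"],  # one results_<keyword>.json file is written per entry
    "api_key": "0123456789abcdef"               # placeholder key for the record API
}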
Example No. 10
def up_all_from_url(info_path,
                    cutoff=None,
                    target='upload_logs',
                    file_exts=None,
                    verbose=False,
                    test=False,
                    target_site=None,
                    only=None,
                    skip=None):
    """
    Upload all images provided as urls in a make_info json file.

    Media (image) files and metadata files with the expected extension .info
    should be in the same directory. Metadata files should contain the entirety
    of the desired description page (in wikitext).

    Outputs separate logfiles for files triggering errors, warnings (and
    successes) so that these can be used in later runs.

    @param info_path: path to the make_info json file
    @param cutoff: number of files to upload (defaults to all)
    @param target: sub-directory for log files (defaults to "upload_logs")
    @param file_exts: tuple of allowed file extensions (defaults to FILE_EXTS)
    @param verbose: whether to output confirmation after each upload
    @param test: set to True to test but not upload
    @param target_site: pywikibot.Site to which file should be uploaded,
        defaults to Commons.
    @param only: list of urls to upload, if provided all others will be skipped
    @param skip: list of urls to skip, all others will be uploaded
    """
    # set defaults unless overridden
    file_exts = file_exts or FILE_EXTS
    target_site = target_site or pywikibot.Site('commons', 'commons')
    target_site.login()

    # load info file
    info_datas = common.open_and_read_file(info_path, as_json=True)

    # create target directory if it doesn't exist
    output_dir = os.path.join(os.path.dirname(info_path), target)
    common.create_dir(output_dir)

    # create all log files
    logs = {
        'success': common.LogFile(output_dir, 'success.log'),
        'warning': common.LogFile(output_dir, 'warnings.log'),
        'error': common.LogFile(output_dir, 'errors.log'),
        'general': common.LogFile(output_dir, 'uploader.log')
    }

    # shortcut to the general/verbose logfile
    flog = logs['general']

    # filtering based on entries in only/skip
    kill_list = set()
    if only:
        kill_list |= set(info_datas.keys()) - set(only)  # difference
    if skip:
        kill_list |= set(info_datas.keys()) & set(skip)  # intersection
    for key in kill_list:
        del info_datas[key]
    flog.write_w_timestamp('{} files remain to upload after filtering'.format(
        len(info_datas)))

    counter = 1
    for url, data in info_datas.items():
        if cutoff and counter > cutoff:
            break

        # verify that the file extension is ok
        try:
            ext = verify_url_file_extension(url, file_exts)
        except common.MyError as e:
            flog.write_w_timestamp(e)
            continue

        # verify that info and output filenames are provided
        if not data['info']:
            flog.write_w_timestamp(
                '{url}: Found url missing the info field (at least)'.format(
                    url=url))
            continue
        elif not data['filename']:
            flog.write_w_timestamp(
                '{url}: Found url missing the output filename'.format(url=url))
            continue

        # prepare upload
        txt = make_info_page(data)
        filename = '{filename}{ext}'.format(filename=data['filename'], ext=ext)

        if test:
            pywikibot.output(
                'Test upload "{filename}" from "{url}" with the following '
                'description:\n{txt}\n'.format(filename=filename,
                                               url=url,
                                               txt=txt))
            counter += 1
            continue
        # stop here if testing

        result = upload_single_file(filename,
                                    url,
                                    txt,
                                    target_site,
                                    upload_if_badprefix=True)
        if result.get('error'):
            logs['error'].write(url)
        elif result.get('warning'):
            logs['warning'].write(url)
        else:
            logs['success'].write(url)
        if verbose:
            pywikibot.output(result.get('log'))

        flog.write_w_timestamp(result.get('log'))
        counter += 1

    for log in logs.values():
        pywikibot.output(log.close_and_confirm())
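A minimal call sketch for the url-based uploader above; the json path and urls are illustrative. A dry run with `test=True` only prints the would-be description pages:

import pywikibot

# dry run on the first five entries, printing the description pages
up_all_from_url('output/batch.json', cutoff=5, test=True, verbose=True)

# real run against Wikimedia Commons, skipping two known-bad urls
up_all_from_url(
    'output/batch.json',
    target_site=pywikibot.Site('commons', 'commons'),
    skip=['http://example.org/broken_1.jpg', 'http://example.org/broken_2.jpg'])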
Example No. 11
def up_all(in_path,
           cutoff=None,
           target='Uploaded',
           file_exts=None,
           verbose=False,
           test=False,
           target_site=None,
           chunked=True):
    """
    Upload all matched files in the supplied directory.

    Media (image) files and metadata files with the expected extension .info
    should be in the same directory. Metadata files should contain the entirety
    of the desired description page (in wikitext).

    Moves each file to one of the target folders after processing.

    @param in_path: path to directory with files to upload
    @param cutoff: number of files to upload (defaults to all)
    @param target: sub-directory for uploaded files (defaults to "Uploaded")
    @param file_exts: tuple of allowed file extensions (defaults to FILE_EXTS)
    @param verbose: whether to output confirmation after each upload
    @param test: set to True to test but not upload
    @param target_site: pywikibot.Site to which file should be uploaded,
        defaults to Commons.
    @param chunked: Whether to do chunked uploading or not.
    """
    # set defaults unless overridden
    file_exts = file_exts or FILE_EXTS
    target_site = target_site or pywikibot.Site('commons', 'commons')
    target_site.login()

    # Verify in_path
    if not os.path.isdir(in_path):
        pywikibot.output('The provided in_path was not a valid '
                         'directory: %s' % in_path)
        exit()

    # create target directories if they don't exist
    done_dir = os.path.join(in_path, target)
    error_dir = '%s_errors' % done_dir
    warnings_dir = '%s_warnings' % done_dir
    common.create_dir(done_dir)
    common.create_dir(error_dir)
    common.create_dir(warnings_dir)

    # logfile
    flog = common.LogFile(in_path, '¤uploader.log')

    # find all content files
    found_files = prepUpload.find_files(path=in_path,
                                        file_exts=file_exts,
                                        subdir=False)
    counter = 1
    for f in found_files:
        if cutoff and counter > cutoff:
            break
        # verify that there is a matching info file
        info_file = '%s.info' % os.path.splitext(f)[0]
        base_name = os.path.basename(f)
        base_info_name = os.path.basename(info_file)
        if not os.path.exists(info_file):
            flog.write_w_timestamp(
                '{0}: Found multimedia file without info'.format(base_name))
            continue

        # prepare upload
        txt = common.open_and_read_file(info_file)

        if test:
            pywikibot.output('Test upload "%s" with the following '
                             'description:\n%s\n' % (base_name, txt))
            counter += 1
            continue
        # stop here if testing

        target_dir = None
        result = upload_single_file(base_name,
                                    f,
                                    txt,
                                    target_site,
                                    upload_if_badprefix=True,
                                    chunked=chunked)
        if result.get('error'):
            target_dir = error_dir
        elif result.get('warning'):
            target_dir = warnings_dir
        else:
            target_dir = done_dir
        if verbose:
            pywikibot.output(result.get('log'))

        flog.write_w_timestamp(result.get('log'))
        os.rename(f, os.path.join(target_dir, base_name))
        os.rename(info_file, os.path.join(target_dir, base_info_name))
        counter += 1

    pywikibot.output(flog.close_and_confirm())
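And a corresponding sketch for the directory-based uploader; the directory name is illustrative. Processed files are moved into the `Uploaded`, `Uploaded_errors` or `Uploaded_warnings` subdirectories created above:

# dry run on the first three matched files in the directory
up_all('to_upload', cutoff=3, test=True, verbose=True)

# full run with chunked uploading disabled
up_all('to_upload', chunked=False)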