Example #1
def read_global_site_json(self):
    # Resume from any previously saved crawl state; on a first run these
    # files do not exist yet, so read failures are deliberately ignored.
    try:
        self.global_site_urls = adm.read_json(self.CRAWLING_OUTPUT_URLS)
        self.global_site_pages = adm.read_json(self.CRAWLING_OUTPUT_PAGES)
        self.global_site_redirects = adm.read_json(self.CRAWLING_OUTPUT_REDIRECTS)
        self.global_site_error_urls = adm.read_json(self.CRAWLING_OUTPUT_ERROR_URLS)
        self.global_site_ignored_urls = adm.read_json(self.CRAWLING_OUTPUT_IGNORED_URLS)
    except (OSError, ValueError):
        pass
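Every example here calls adm.read_json from iiab.adm_lib. Its implementation is not shown in these snippets, but from the way it is used it behaves like a thin wrapper around json.load; a minimal sketch under that assumption:

import json

def read_json(file_path):
    # Assumed behavior only: open the file and return the parsed
    # JSON object, raising OSError/ValueError on a missing or bad file.
    with open(file_path, 'r') as f:
        return json.load(f)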
Example #2
def do_content(this_preset_dir, noscan):
    global content
    if role_stats['kalite']['active']:  # role_stats is a module-level global
        content["kalite"] = {'lang_code': 'en', 'topics': []}  # defaults

    content_file = this_preset_dir + 'content.json'

    if os.path.exists(content_file):
        old_content = adm.read_json(content_file)
    else:
        old_content = {}

    if noscan:
        content_from_menu(this_preset_dir)
    else:
        content_from_files()

    # read list of maps
    if os.path.exists(map_path):  # map_path is a module-level global
        excl_maps = ['']  # present in the original source but unused in this loop
        map_list = os.listdir(map_path)
        for fname in map_list:
            content["maps"].append(fname)

    # preserve any kalite for now
    content["kalite"] = old_content.get("kalite", {})
    if role_stats['kalite']['active']:
        lang = get_kalite_lang()
        content["kalite"]["lang_code"] = lang
        get_kalite_complete('khan/', lang)  # populates the kalite_topics global
        content["kalite"]["topics"] = kalite_topics

    adm.write_json_file(content, content_file)
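The counterpart adm.write_json_file(data, path), used at the end of this example and in several others below, evidently serializes a dict to a file. A comparable sketch, again an assumption rather than the library's actual code:

import json

def write_json_file(data, file_path):
    # Assumed behavior: write indented JSON, since the catalog and
    # content files these examples produce are human-readable.
    with open(file_path, 'w') as f:
        json.dump(data, f, indent=2)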
Example #3
def main():
    global map_catalog
    global base_catalog

    args = parse_args()
    map_id = args.map_id

    catalog = adm.read_json(catalog_path)
    map_catalog = catalog['maps']
    base_catalog = catalog['base']
    #for k in catalog.keys():
      #print(k)

    is_map = map_id in map_catalog
    is_base = map_id in base_catalog

    if not is_base and not is_map:
        print('Download URL not found in Map Catalog: %s' % args.map_id)
        sys.exit(1)

    # create init.json which sets initial coords and zoom
    if is_map:
        init = {}
        map_info = map_catalog[map_id]  # 'map_info' rather than 'map', which shadows the builtin
        init['region'] = map_info['region']
        init['zoom'] = map_info['zoom']
        init['center_lon'] = map_info['center_lon']
        init['center_lat'] = map_info['center_lat']
        init_fn = viewer_path + '/init.json'
        adm.write_json_file(init, init_fn)

    installed_maps = get_installed_tiles()
    print('installed_maps')
    print(repr(installed_maps))
    write_vector_map_idx_v2(installed_maps)
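The init.json written above holds only the four keys set in the loop. With hypothetical values, the output might look like:

{
  "region": "north_america",
  "zoom": 4,
  "center_lon": -98.5,
  "center_lat": 39.8
}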
Example #4
def main(argv):
    global site_urls
    global site_pages
    global site_redirects
    global site_ignored_urls
    global site_error_urls

    # Pass in json file name
    if len(sys.argv) > 1:
        site = sys.argv[1]
    else:
        print('usage: site-analyzer.py <site>')
        sys.exit(1)

    url_json_file = site + '_urls.json'
    page_json_file = site + '_pages.json'
    redirects_json_file = site + '_redirects.json'
    ignored_urls_json_file = site + '_ignored_urls.json'
    error_urls_json_file = site + '_error_urls.json'

    try:
        site_urls = adm.read_json(url_json_file)
        site_pages = adm.read_json(page_json_file)
        site_redirects = adm.read_json(redirects_json_file)
        site_error_urls = adm.read_json(error_urls_json_file)
        site_ignored_urls = adm.read_json(ignored_urls_json_file)
    except (OSError, ValueError):
        print('Unable to read one or more site files')
        sys.exit(1)

    calc_page_children()
    compare_urls()  # look for page/url mismatches

    sum_content_types()
    #recursive_visit_extract_urls(channel_dict)

    json_formatted_str = json.dumps(content_types, indent=2)
    print(json_formatted_str)

    for content_type in content_types:
        print(content_type, content_types[content_type]['count'],
              human_readable(content_types[content_type]['bytes']))

    print('Total Site Size: ' + human_readable(total_bytes))
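human_readable is not defined in this snippet; a plausible stand-in (hypothetical, for illustration only) that scales a byte count the way the summary lines above expect:

def human_readable(num_bytes):
    # Hypothetical helper: scale a raw byte count into a short unit string.
    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
        if num_bytes < 1024.0:
            return '%3.1f %s' % (num_bytes, unit)
        num_bytes /= 1024.0
    return '%3.1f PB' % num_bytes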
Example #5
def content_from_menu(this_preset_dir):
    global content
    menu = adm.read_json(this_preset_dir + 'menu.json')
    all_menu_defs = adm.get_all_menu_defs()
    for menu_def in menu["menu_items_1"]:
        if menu_def in all_menu_defs:
            if all_menu_defs[menu_def]["intended_use"] == "html":
                 content["modules"].append(all_menu_defs[menu_def]["moddir"])
            elif all_menu_defs[menu_def]["intended_use"] == "zim":
                content["zims"].append(all_menu_defs[menu_def]["zim_name"])
Example #6
def main():
    global verbose
    global download_flag

    oer2go_catalog = {}
    err_num = 0
    err_str = "SUCCESS"

    local_oer2go_catalog = adm.read_json(adm.CONST.oer2go_catalog_file)
    local_oer2go_catalog = local_oer2go_catalog['modules']

    err_num, err_str, oer2go_catalog_v2 = get_oer2go_cat_v2()

    for item in oer2go_catalog_v2:
        if item in local_oer2go_catalog:
            continue
        module_id = oer2go_catalog_v2[item]['module_id']  # 'module_id' avoids shadowing the builtin 'id'
        moddir = oer2go_catalog_v2[item]['moddir']
        if module_id in dup_list:  # dup_list is a module-level global
            continue
        print(module_id, moddir)
Example #7
import sys

import iiab.iiab_lib as iiab
import iiab.adm_lib as adm

try:
    adm.pcgvtd9()
except Exception:
    print("Unable to contact Server")
    sys.exit(1)

# load lang codes
iiab.read_lang_codes()

local_menu_item_defs = adm.get_local_menu_item_defs() # returns dict
menu_def_repo_data = adm.get_menu_def_repo_data() # returns dict
repo_menu_item_defs = menu_def_repo_data['defs']
obsolete_menu_item_defs = adm.read_json(adm.CONST.obsolete_menu_defs)
changes_made = False

# download menu item defs from repo that are not present
for menu_item_def_name in repo_menu_item_defs:
    if menu_item_def_name not in local_menu_item_defs:
        if menu_item_def_name in obsolete_menu_item_defs:
            print('Skipping obsolete menu definition ' + menu_item_def_name)
            continue # don't download obsolete
        menu_item_def = adm.get_menu_item_def_from_repo_by_name(menu_item_def_name)
        adm.write_other_menu_item_def_files(menu_item_def)
        adm.write_menu_item_def(menu_item_def_name, menu_item_def)
        print('Downloading new remote menu item definition ' + menu_item_def_name)
        changes_made = True
# upload new and changed local menu item defs to repo if upload_flag set
for menu_item_def_name in local_menu_item_defs:
Example #8
def main():

    global verbose
    oer2go_catalog = {}

    args = parse_args()
    if args.verbose:
        verbose = True

    # make sure we have menu js_menu_dir if args.menu true
    if args.menu:
        if not os.path.isdir(adm.CONST.js_menu_dir):
            sys.stdout.write(
                "GET-OER2GO-CAT ERROR - iiab-menu not installed and --menu option given\n"
            )
            sys.stdout.flush()
            sys.exit(99)

    # for now we will assume that old modules are still in the current catalog
    # get new oer2go catalog unless told not to

    if not args.no_download:
        try:
            url_handle = urllib.request.urlopen(adm.CONST.oer2go_cat_url)
            oer2go_catalog_json = url_handle.read()
            url_handle.close()
        except urllib.error.URLError as exc:
            sys.stdout.write("GET-OER2GO-CAT ERROR - " + str(exc.reason) + '\n')
            sys.stdout.flush()
            sys.exit(1)
        try:
            url_handle = urllib.request.urlopen(adm.CONST.iiab_module_cat_url)
            iiab_catalog_json = url_handle.read()
            url_handle.close()
        except urllib.error.URLError as exc:
            sys.stdout.write("GET-OER2GO-CAT ERROR - " + str(exc.reason) + '\n')
            sys.stdout.flush()
            sys.exit(2)

        # now try to parse
        try:
            oer2go_catalog = json.loads(oer2go_catalog_json)
            iiab_catalog = json.loads(iiab_catalog_json)
        except ValueError as exc:  # json.JSONDecodeError subclasses ValueError
            sys.stdout.write("GET-OER2GO-CAT ERROR - " + str(exc) + '\n')
            sys.stdout.flush()
            sys.exit(3)

        # merge iiab_catalog.json if it was downloaded; otherwise assume it was previously merged
        for item in iiab_catalog:
            moddir = item['moddir']
            module_id = item['module_id']
            module = item
            iiab_oer2go_catalog[moddir] = module

    else:
        local_oer2go_catalog = adm.read_json(adm.CONST.oer2go_catalog_file)
        oer2go_catalog = local_oer2go_catalog['modules']

    working_dir = adm.CONST.rachel_working_dir + str(uuid.uuid4()) + "/"
    os.mkdir(working_dir)
    #os.mkdir(iiab_menu_download_dir)

    for item in oer2go_catalog:  # structure of local and remote catalogs is different
        if args.no_download:  # local
            moddir = item
            module = oer2go_catalog[moddir]
            module_id = module['module_id']
        else:  # remote
            moddir = item['moddir']
            module_id = item['module_id']
            module = item

        if moddir is None:  # skip items with no moddir
            continue

        menu_item_name = moddir
        if module_id not in dup_list:
            is_downloaded, has_menu_def = adm.get_module_status(module)
            if args.menu and is_downloaded:
                if not has_menu_def:
                    menu_item_name = adm.create_module_menu_def(
                        module, working_dir, incl_extra_html=False)
                    msg = "Generating menu files"
                    if verbose:
                        print("%s %s %s" % (msg, module_id, moddir))
                adm.update_menu_json(menu_item_name)  # only adds if not already in menu
        else:
            msg = "Skipping module not needed by Internet in a Box"
            if verbose:
                print("%s %s %s" % (msg, module_id, moddir))
            continue
        iiab_oer2go_catalog[moddir] = module

    # no need to write the catalog if it wasn't downloaded, since we don't need wip and other extra menu def fields
    if not args.no_download:
        dated_oer2go_cat = {}
        dated_oer2go_cat['download_date'] = time.strftime("%Y-%m-%d.%H:%M:%S")
        dated_oer2go_cat['modules'] = iiab_oer2go_catalog

        with open(adm.CONST.oer2go_catalog_file, 'w') as outfile:
            json.dump(dated_oer2go_cat, outfile, indent=2)

    shutil.rmtree(working_dir)

    sys.stdout.write("SUCCESS")
    sys.stdout.flush()
    sys.exit(0)
Example #9
import os
import re
import string
from urllib.parse import urljoin, urldefrag, urlparse
from bs4 import BeautifulSoup, Comment, SoupStrainer
import iiab.adm_lib as adm

site = 'rarediseases.info.nih.gov'

orig_dir = '/articlelibrary/viewarticle/'
base_url = 'https://' + site + orig_dir
src_dir = 'raw/html'
dst_dir = '/library/www/html/modules/en-nih_rarediseases'

# read urls
url_json_file = site + '_urls.json'
site_urls = adm.read_json(url_json_file)

def main(argv):
    # need site_urls for type of image - see below

    file_list = os.listdir(src_dir)
    #file_list = ['article-17922.html','article-41380.html','article-788.html', 'article-99590.html', 'article-29120.html', 'article-16989.html']
    for filename in file_list:
        print('Converting ' + filename)
        if not filename.endswith(".html"):
            print('Skipping ' + filename)
            continue
        page = do_page(os.path.join(src_dir, filename))
        html_output = page.encode_contents(formatter='html')

        with open(os.path.join(dst_dir, filename), 'wb') as f:  # os.path.join: dst_dir lacks a trailing slash
            f.write(html_output)
Example #10
def main():
    global verbose
    global download_flag

    oer2go_catalog = {}
    err_num = 0
    err_str = "SUCCESS"

    args = parse_args()
    if args.verbose:
        verbose = True
    if args.no_download:
        download_flag = False

    # make sure we have menu js_menu_dir if args.menu true
    if args.menu:
        if not os.path.isdir(adm.CONST.js_menu_dir):
            sys.stdout.write("GET-OER2GO-CAT ERROR - iiab-menu not installed and --menu option given\n")
            sys.stdout.flush()
            sys.exit(99)

    # always get our catalog
    # failure is fatal
    try:
        url_handle = urllib.request.urlopen(adm.CONST.iiab_module_cat_url)
        iiab_catalog_json = url_handle.read()
        url_handle.close()
        iiab_catalog = json.loads(iiab_catalog_json)
    except (urllib.error.URLError) as exc:
        sys.stdout.write("GET-OER2GO-CAT ERROR - " + str(exc.reason) +'\n')
        sys.stdout.flush()
        sys.exit(2)

    # for now we will assume that old modules are still in the current catalog
    # get new oer2go catalog unless told not to

    if download_flag:
        err_num, err_str, oer2go_catalog = get_oer2go_cat()
        if err_num != 0:
            download_flag = False
    if not download_flag: # get local copy
        local_oer2go_catalog = adm.read_json(adm.CONST.oer2go_catalog_file)
        oer2go_catalog = local_oer2go_catalog['modules']

    # start with iiab_catalog.json
    for item in iiab_catalog:
        moddir = item['moddir']
        module_id = item['module_id']
        module = item
        iiab_oer2go_catalog[moddir] = module

    working_dir = adm.CONST.rachel_working_dir + str(uuid.uuid4()) + "/"
    os.mkdir(working_dir)
    #os.mkdir(iiab_menu_download_dir)

    for item in oer2go_catalog: # structure of local and remote catalogs is different
        if not download_flag: # local
            moddir = item
            module = oer2go_catalog[moddir]
            module_id = module['module_id']
        else: # remote
            moddir = item['moddir']
            module_id = item['module_id']
            module = item

        if moddir is None: # skip items with no moddir
            continue

        menu_item_name = moddir

        if str(module_id) in dup_list:
            msg = "Skipping module not needed by Internet in a Box"
            if verbose:
                print("%s %s %s" % (msg, module_id, moddir))
            continue
        if module.get('type') != 'html':
            continue

    is_downloaded, has_menu_def = adm.get_module_status(module)
        #if args.menu and is_downloaded:
        if args.menu:
            if not has_menu_def:
                menu_item_name = adm.create_module_menu_def(module, working_dir, incl_extra_html=False)
                msg = "Generating menu files"
                if verbose:
                    print("%s %s %s" % (msg, module_id, moddir))
            if is_downloaded:
                adm.update_menu_json(menu_item_name) # only adds if not already in menu

        iiab_oer2go_catalog[moddir] = module

    # write the catalog even if it wasn't downloaded, as ours could have changed
    dated_oer2go_cat = {}
    dated_oer2go_cat['download_date'] = time.strftime("%Y-%m-%d.%H:%M:%S")
    dated_oer2go_cat['modules'] = iiab_oer2go_catalog

    adm.write_json_file(dated_oer2go_cat, adm.CONST.oer2go_catalog_file)

    shutil.rmtree(working_dir)

    sys.stdout.write(err_str)
    sys.stdout.flush()
    sys.exit(err_num)
Example #11
################################################################################
site = 'www.ncbi.nlm.nih.gov'
MAIN_SOURCE_DOMAIN = 'https://' + site
START_PAGE = 'https://' + site
SOURCE_DOMAINS = []
IGNORE_URLS = []
crawler = BasicSpider(main_source_domain=MAIN_SOURCE_DOMAIN)
crawler.IGNORE_URLS.extend(IGNORE_URLS)

crawler.set_output_file_names(site)
crawler.pre_crawl_setup()
crawler.read_global_site_json()

crawler.SHORTEN_CRAWL = True
stat_pearl_catalog = {}
stat_pearl_catalog = adm.read_json('stat-pearl-catalog.json')

test_cnt = 5
for article_id in stat_pearl_catalog:
    original_url = MAIN_SOURCE_DOMAIN + stat_pearl_catalog[article_id]['url']
    local_file = original_url.split('://')[1]
    if local_file[-1] == '/':
        local_file = local_file[:-1] + '.html'
    else:
        local_file = local_file + '.html'

    local_dir = os.path.dirname(local_file)
    if not os.path.exists(local_dir):
        os.makedirs(local_dir)

    if os.path.isfile(local_file):
        continue  # already downloaded; mirrors the matching loop in Example #12
Example #12
################################################################################
site = 'rarediseases.info.nih.gov'
MAIN_SOURCE_DOMAIN = 'https://' + site
START_PAGE = 'https://' + site
SOURCE_DOMAINS = []
IGNORE_URLS = []
crawler = BasicSpider(main_source_domain=MAIN_SOURCE_DOMAIN)
crawler.IGNORE_URLS.extend(IGNORE_URLS)

crawler.set_output_file_names(site)
crawler.pre_crawl_setup()
crawler.read_global_site_json()

crawler.SHORTEN_CRAWL = True
disease_catalog = {}
disease_catalog = adm.read_json('disease-catalog.json')

test_cnt = 5
for disease_url in disease_catalog:
    original_url = MAIN_SOURCE_DOMAIN + disease_url
    local_name = disease_url[1:].replace('/', '.')
    local_file = 'raw/html/' + local_name + '.html'

    local_dir = os.path.dirname(local_file)
    if not os.path.exists(local_dir):
        os.makedirs(local_dir)

    if os.path.isfile(local_file):
        continue
    # print(original_url)
    url, html = crawler.download_page(original_url)