def main():
    parser = argparse.ArgumentParser(prog='index2ddg.py')
    parser.add_argument(
        'index', type=str,
        help='The path to the XML index containing identifier data')
    parser.add_argument(
        'reference', type=str,
        help='The path to the downloaded reference (reference directory in '
             'the downloaded archive)')
    parser.add_argument('output', type=str,
                        help='The path to the destination output.txt file')
    parser.add_argument(
        '--split_code_snippets', action='store_true', default=False,
        help='Puts each declaration into a separate code snippet.')
    parser.add_argument(
        '--max_code_lines', type=int, default=6,
        help='Maximum number of lines of code to show in the abstract')
    parser.add_argument(
        '--max_sentences', type=int, default=1,
        help='Maximum number of sentences to use for the description')
    parser.add_argument(
        '--max_characters', type=int, default=200,
        help='Maximum number of characters to use for the description')
    parser.add_argument(
        '--max_paren_chars', type=int, default=40,
        help='Maximum size of parenthesized text in the description. '
             'Parenthesized chunks longer than that are removed, unless they '
             'are within <code>, <b> or <i> tags')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Enables debug mode.')
    parser.add_argument(
        '--debug_ident', type=str, default=None,
        help='Processes only the identifiers that match debug_ident')
    parser.add_argument(
        '--debug_abstracts_path', type=str, default=None,
        help='Path to print the abstracts before newline stripping occurs')
    args = parser.parse_args()

    # If --debug is passed, the program switches to debug mode and prints
    # everything to stdout. If --debug_ident is provided, the program
    # processes only the identifiers that match the provided string
    debug = DDGDebug(args.debug, args.debug_ident, args.debug_abstracts_path)

    index_file = args.index
    output_file = args.output

    # A map that stores information about the location and type of
    # identifiers. It's a two-level map: full_link maps to a dict that maps
    # full_name to an ITEM_TYPE_* value
    ident_map = {}

    # get a list of pages to analyze
    tr = Index2DuckDuckGoList(ident_map)
    tr.transform_file(index_file)

    # get a mapping between titles and pages
    # link_map.mapping is a dict { title -> filename }
    link_map = build_link_map(args.reference)

    # create a list of processing instructions for each page
    proc_ins = get_processing_instructions(ident_map, link_map)

    # sort proc_ins to produce ordered output.txt
    proc_ins = sorted(proc_ins.values(), key=lambda x: x['link'])
    for page in proc_ins:
        page['idents'] = sorted(page['idents'].values(),
                                key=lambda x: x['ident'])

    redirects = []

    out = open(output_file, 'w', encoding='utf-8')

    for page in proc_ins:
        idents = page['idents']
        link = page['link']
        fn = page['fn']

        if debug.should_skip_ident([i['ident'] for i in idents]):
            continue

        root = e.parse(os.path.join(args.reference, fn),
                       parser=html.HTMLParser())

        for ident in idents:
            item_ident = ident['ident']
            item_type = ident['type']
            process_identifier(out, redirects, root, link,
                               item_ident, item_type, args, debug=debug)

    output_redirects(out, redirects)

    if debug.enabled:
        print('=============================')
        print('Numbers of lines used:')
        for i, l in enumerate(debug.stat_line_nums):
            print(str(i) + ': ' + str(l) + ' result(s)')
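# An illustrative invocation, assuming the script is run from the directory
# holding the unpacked archive (the index filename below is an example, not
# a name this script mandates):
#
#   python3 index2ddg.py index-functions-cpp.xml reference output.txt \
#       --split_code_snippets --max_sentences=2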
def get_processing_instructions(ident_map, link_map):
    ''' Creates a processing instruction for each page referenced from
        ident_map, grouping the identifiers by the file that documents them
    '''
    proc_ins = {}

    for link in ident_map:
        if link in link_map.mapping:
            fn = link_map.mapping[link]
            if fn not in proc_ins:
                proc_ins[fn] = {'fn': fn, 'link': link, 'idents': {}}
            for ident in ident_map[link]:
                proc_ins[fn]['idents'][ident] = {
                    'ident': ident,
                    'type': ident_map[link][ident]
                }

    return proc_ins
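# A sketch of one resulting processing instruction, with illustrative values
# (the filename, link and ITEM_TYPE_FUNCTION constant are examples, not
# values taken from this file):
#
#   proc_ins['numeric/math/nan.html'] == {
#       'fn': 'numeric/math/nan.html',
#       'link': 'cpp/numeric/math/nan',
#       'idents': {
#           'std::nan': {'ident': 'std::nan', 'type': ITEM_TYPE_FUNCTION},
#       },
#   }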
else: print("Loader file " + fn + " does not match any known files") sys.exit(1) rename_file(root, fn, new_fn) # rename filenames that conflict on case-insensitive filesystems # TODO: perform this automatically rename_file('output/reference/en/cpp/numeric/math', 'NAN.html', 'NAN.2.html') rename_file('output/reference/en/c/numeric/math', 'NAN.html', 'NAN.2.html') # clean FAQ clean_faq('output') # generate link map as long as there is all information present build_link_map() # find files that need to be preprocessed html_files = [] for root, dirnames, filenames in os.walk('output/reference/'): for filename in fnmatch.filter(filenames, '*.html'): html_files.append(os.path.join(root, filename)) #temporary fix # r1 = re.compile('<style[^<]*?<[^<]*?MediaWiki:Geshi\.css[^<]*?<\/style>', re.MULTILINE) # fix links to files in rename_map rlink = re.compile('((?:src|href)=")([^"]*)(")') html_comment = re.compile("<!--(.|\s)*?-->")