Пример #1
0
def search():
	"""Run the posted query through both search backends and return the merged hits.

	The search term is read from the ``query`` key of the JSON request body and
	passed to ``bs.main`` and ``bs2.main``. Every hit from either backend is
	stored under a freshly generated UUID4 string.

	Returns:
		The ``jsonify`` response with three keys: ``results`` (UUID -> hit),
		``amount`` (number of hits) and ``keys`` (list of the UUIDs).
	"""
	query = request.json["query"]  # get search term
	result1 = bs.main(query)
	result2 = bs2.main(query)

	if result1 and result2:
		results = result1 + result2
	else:
		# At most one backend produced hits; fall back to an empty list
		# when neither did.
		results = result1 or result2 or []

	# Iterate over the hits themselves: the previous
	# ``range(len(results) - 1)`` loop silently dropped the last one.
	search_results = {str(uuid.uuid4()): hit for hit in results}

	return jsonify({
		"results": search_results,
		"amount": len(search_results),
		"keys": list(search_results.keys()),
	})
Пример #2
0
import conf_loader
from bs import main

# Module-level Elasticsearch client used by save_es() below.
# Created without arguments, so by default we connect to localhost:9200.
es = Elasticsearch()


def save_es(index, doc_type, profiles):
    """Store profiles in the local Elasticsearch and print the indexed names.

    Each profile is indexed with its position in ``profiles`` as the
    document id. Afterwards the index is queried with ``match_all`` and the
    ``name`` field of every returned hit is printed.

    Args:
        index(str) :- Elasticsearch index
        doc_type(str) :- Elasticsearch document type
        profiles(iterables) :- JSON documents to store, in the form [dict]
    """
    for position, document in enumerate(profiles):
        es.index(index=index, doc_type=doc_type, id=position, body=document)

    everything = {"query": {"match_all": {}}}
    response = es.search(index=index, body=everything)
    for hit in response["hits"]["hits"]:
        print(hit["_source"]["name"])


if __name__ == "__main__":
    # Collect the profiles first, then index them into the target named by
    # conf_loader.
    scraped = main()
    save_es(
        index=conf_loader.index,
        doc_type=conf_loader.doc_type,
        profiles=scraped,
    )

Пример #3
0
from anolislib import generator, utils

# Choose the spec name and the boilerplate selector from the first
# command-line argument; anything else prints the usage line and exits.
# The one-element slice is empty when no argument was given, so it never
# raises IndexError.
if sys.argv[1:2] == ['html']:
  spec = 'html'
  select = 'w3c-html'
elif sys.argv[1:2] == ['2dcontext']:
  select = spec = '2dcontext'
else:
  sys.stderr.write("Usage: python %s [html|2dcontext]\n" % sys.argv[0])
  exit()

print 'parsing'
# Work from the parent of the directory that contains this script, so the
# relative 'source' path below resolves there.
os.chdir(os.path.abspath(os.path.join(__file__, '../..')))
source = open('source')
# First pass: bs.main is given the source file and an in-memory output buffer.
# NOTE(review): `source` is not closed in the visible part of the script.
succint = StringIO()
bs.main(source, succint)

# Rewind the first-pass buffer and feed it to the boilerplate step together
# with the selector chosen from the command line.
succint.seek(0)
filtered = StringIO()
boilerplate.main(succint, filtered, select)
succint.close()

# See http://hg.gsnedders.com/anolis/file/tip/anolis
opts = {
  'allow_duplicate_dfns': True,
  'disable': None,
  'escape_lt_in_attrs': False,
  'escape_rcdata': False,
  'force_html4_id': False,
  'indent_char': u' ',
  'inject_meta_charset': False,
Пример #4
0
def main(spec, spec_dir, branch="master"):
    conf = None
    try:
        conf = config.load_config()[spec]
    except KeyError:
        invoked_incorrectly()

    if 'select' in conf:
        select = conf['select']
    else:
        select = spec

    try:
        if not spec_dir:
            spec_dir = os.path.join(conf["output"], spec)
    except KeyError:
        sys.stderr.write("error: Must specify output directory for %s! \
Check default-config.json.\n" % spec)
        exit()

    cur_dir = os.path.abspath(os.path.dirname(__file__))
    os.chdir(conf["path"])

    print "parsing"
    source = open('source')
    after_microsyntax = StringIO()
    parser_microsyntax.main(source, after_microsyntax)
    after_microsyntax.seek(0)
    succint = StringIO()
    bs.main(after_microsyntax, succint)

    succint.seek(0)
    filtered = StringIO()
    try:
        boilerplate.main(succint, filtered, select, branch)
    except IOError:
        sys.stderr.write("error: Problem loading boilerplate for %s. \
Are you on the correct branch?\n" % spec)
        exit()
    succint.close()

    # See http://hg.gsnedders.com/anolis/file/tip/anolis
    opts = {
      'allow_duplicate_dfns': True,
      'disable': None,
      'escape_lt_in_attrs': False,
      'escape_rcdata': False,
      'force_html4_id': False,
      'indent_char': u' ',
      'inject_meta_charset': False,
      'max_depth': 6,
      'min_depth': 2,
      'minimize_boolean_attributes': False,
      'newline_char': u'\n',
      'omit_optional_tags': False,
      'output_encoding': 'utf-8',
      'parser': 'html5lib',
      'processes': set(['toc', 'xref', 'sub']),
      'profile': False,
      'quote_attr_values': True,
      'serializer': 'html5lib',
      'space_before_trailing_solidus': False,
      'strip_whitespace': None,
      'use_best_quote_char': False,
      'use_trailing_solidus': False,
      'w3c_compat_class_toc': False,
      'w3c_compat_crazy_substitutions': False,
      'w3c_compat_substitutions': False,
      'w3c_compat': True,
      'w3c_compat_xref_a_placement': False,
      'w3c_compat_xref_elements': False,
      'w3c_compat_xref_normalization': False,
    }
    if "anolis" in conf:
        opts.update(conf["anolis"])

    if spec == "srcset":
        import html5lib

        print 'munging (before anolis)'

        filtered.seek(0)
        pre_anolis_buffer = StringIO()

        # Parse
        parser = html5lib.html5parser.HTMLParser(tree = html5lib.treebuilders.getTreeBuilder('lxml'))
        tree = parser.parse(filtered, encoding='utf-8')

        # Move introduction above conformance requirements
        introduction = tree.findall("//*[@id='introduction']")[0]
        intro_ps = introduction.xpath("following-sibling::*")
        target = tree.findall("//*[@id='conformance-requirements']")[0]
        target.addprevious(introduction)
        target = introduction
        target.addnext(intro_ps[2])
        target.addnext(intro_ps[1])
        target.addnext(intro_ps[0])

        # Serialize
        tokens = html5lib.treewalkers.getTreeWalker('lxml')(tree)
        serializer = html5lib.serializer.HTMLSerializer(quote_attr_values=True, inject_meta_charset=False)
        for text in serializer.serialize(tokens, encoding='utf-8'):
            pre_anolis_buffer.write(text)

        filtered = pre_anolis_buffer

    print 'indexing'
    filtered.seek(0)
    tree = generator.fromFile(filtered, **opts)
    filtered.close()

    # fixup nested dd's and dt's produced by lxml
    for dd in tree.findall('//dd/dd'):
        if list(dd) or dd.text.strip():
            dd.getparent().addnext(dd)
        else:
            dd.getparent().remove(dd)
    for dt in tree.findall('//dt/dt'):
        if list(dt) or dt.text.strip():
            dt.getparent().addnext(dt)
        else:
            dt.getparent().remove(dt)

    if spec == "microdata":
        print 'munging'
        import lxml
        # get the h3 for the misplaced section (it has no container)
        section = tree.xpath("//h3[@id = 'htmlpropertiescollection']")[0]
        # then get all of its following siblings that have the h2 for the next section as 
        # a following sibling themselves. Yeah, XPath doesn't suck.
        section_content = section.xpath("following-sibling::*[following-sibling::h2[@id='introduction']]")
        target = tree.xpath("//h2[@id = 'converting-html-to-other-formats']")[0].getparent()
        target.addprevious(section)
        for el in section_content: target.addprevious(el)
        section.xpath("span")[0].text = "6.1 "
        # move the toc as well
        link = tree.xpath("//ol[@class='toc']//a[@href='#htmlpropertiescollection']")[0]
        link.xpath("span")[0].text = "6.1 "
        tree.xpath("//ol[@class='toc']/li[a[@href='#microdata-dom-api']]")[0].append(link.getparent().getparent())

    if spec == "srcset":
        print 'munging (after anolis)'
        # In the WHATWG spec, srcset="" is simply an aspect of
        # HTMLImageElement and not a separate feature. In order to keep
        # the HTML WG's srcset="" spec organized, we have to move some
        # things around in the final document.

        # Move "The srcset IDL attribute must reflect..."
        reflect_the_content_attribute = tree.findall("//div[@class='impl']")[0]
        target = tree.find("//div[@class='note']")
        target.addprevious(reflect_the_content_attribute)

        # Move "The IDL attribute complete must return true..."
        note_about_complete = tree.findall("//p[@class='note']")[5]
        p_otherwise = note_about_complete.xpath("preceding-sibling::p[position()=1]")[0]
        ul_conditions = p_otherwise.xpath("preceding-sibling::ul[position()=1]")[0]
        p_start = ul_conditions.xpath("preceding-sibling::p[position()=1]")[0]
        target.addnext(note_about_complete)
        target.addnext(p_otherwise)
        target.addnext(ul_conditions)
        target.addnext(p_start)

    try:
        os.makedirs(spec_dir)
    except:
        pass

    if spec == 'html':
        print 'cleaning'
        from glob import glob
        for name in glob("%s/*.html" % spec_dir):
            os.remove(name)

        output = StringIO()
    else:
        output = open("%s/Overview.html" % spec_dir, 'wb')

    generator.toFile(tree, output, **opts)

    if spec != 'html':
        output.close()
    else:
        value = output.getvalue()
        if "<!--INTERFACES-->\n" in value:
            print 'interfaces'
            from interface_index import interface_index
            output.seek(0)
            index = StringIO()
            interface_index(output, index)
            value = value.replace("<!--INTERFACES-->\n", index.getvalue(), 1)
            index.close()
        output = open("%s/single-page.html" % spec_dir, 'wb')
        output.write(value)
        output.close()
        value = ''

        print 'splitting'
        import spec_splitter
        spec_splitter.w3c = True
        spec_splitter.no_split_exceptions = conf.get("no_split_exceptions", False)
        spec_splitter.minimal_split_exceptions = conf.get("minimal_split_exceptions", False)
        spec_splitter.main("%s/single-page.html" % spec_dir, spec_dir)

        print 'entities'
        entities = open(os.path.join(cur_dir, "boilerplate/entities.inc"))
        json = open("%s/entities.json" % spec_dir, 'w')
        from entity_processor_json import entity_processor_json
        entity_processor_json(entities, json)
        entities.close()
        json.close()

    # copying dependencies
    def copy_dependencies (targets):
        import types
        if not isinstance(targets, types.ListType): targets = [targets]
        for target in targets:
            os.system("/bin/csh -i -c '/bin/cp -R %s %s'" % (os.path.join(conf["path"], target), spec_dir))

    print "copying"
    if spec == "html":
        copy_dependencies(["images", "fonts", "404/*", "switcher", "js"])
    elif spec == "2dcontext":
        copy_dependencies(["images", "fonts"])
    else:
        copy_dependencies("fonts")

    # fix the styling of the 404
    if spec == "html":
        link = tree.xpath("//link[starts-with(@href, 'http://www.w3.org/StyleSheets/TR/')]")[0].get("href")
        path = os.path.join(spec_dir, "404.html")
        with open(path) as data: html404 = data.read()
        html404 = html404.replace("http://www.w3.org/StyleSheets/TR/W3C-ED", link)
        with open(path, "w") as data: data.write(html404)
Пример #5
0
def main(spec, spec_dir, branch="master"):
    conf = None
    try:
        conf = config.load_config()[spec]
    except KeyError:
        invoked_incorrectly()

    if "select" in conf:
        select = conf["select"]
    else:
        select = spec

    try:
        if not spec_dir:
            if conf.get("bareOutput", False):
                spec_dir = conf["output"]
            else:
                spec_dir = os.path.join(conf["output"], spec)
    except KeyError:
        sys.stderr.write(
            "error: Must specify output directory for %s! \
Check default-config.json.\n"
            % spec
        )
        exit()

    cur_dir = os.path.abspath(os.path.dirname(__file__))
    os.chdir(conf["path"])

    print "parsing"
    source = open("source")
    after_microsyntax = StringIO()
    parser_microsyntax.main(source, after_microsyntax)
    after_microsyntax.seek(0)
    succint = StringIO()
    bs.main(after_microsyntax, succint)

    succint.seek(0)
    filtered = StringIO()
    if spec == "microdata":
        md_content = succint.read()
        md_content = re.sub(
            '<h2 id="iana">IANA considerations</h2>',
            '<!--BOILERPLATE microdata-extra-section--><h2 id="iana">IANA considerations</h2>',
            md_content,
        )
        succint = StringIO()
        succint.write(md_content)
        succint.seek(0)

    try:
        boilerplate.main(succint, filtered, select, branch)
    except IOError:
        sys.stderr.write(
            "error: Problem loading boilerplate for %s. \
Are you on the correct branch?\n"
            % spec
        )
        exit()
    succint.close()

    # See http://hg.gsnedders.com/anolis/file/tip/anolis
    opts = {
        "allow_duplicate_dfns": True,
        "disable": None,
        "escape_lt_in_attrs": False,
        "escape_rcdata": False,
        "force_html4_id": False,
        "indent_char": u" ",
        "inject_meta_charset": False,
        "max_depth": 6,
        "min_depth": 2,
        "minimize_boolean_attributes": False,
        "newline_char": u"\n",
        "omit_optional_tags": False,
        "output_encoding": "utf-8",
        "parser": "html5lib",
        "processes": set(["toc", "xref", "sub"]),
        "profile": False,
        "quote_attr_values": True,
        "serializer": "html5lib",
        "space_before_trailing_solidus": False,
        "strip_whitespace": None,
        "use_best_quote_char": False,
        "use_trailing_solidus": False,
        "w3c_compat_class_toc": False,
        "w3c_compat_crazy_substitutions": False,
        "w3c_compat_substitutions": False,
        "w3c_compat": True,
        "w3c_compat_xref_a_placement": False,
        "w3c_compat_xref_elements": False,
        "w3c_compat_xref_normalization": False,
    }
    if "anolis" in conf:
        opts.update(conf["anolis"])

    if spec == "srcset":
        print "munging (before anolis)"

        filtered.seek(0)
        pre_anolis_buffer = StringIO()

        # Parse
        parser = html5lib.html5parser.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"))
        tree = parser.parse(filtered, encoding="utf-8")

        # Move introduction above conformance requirements
        introduction = tree.findall("//*[@id='introduction']")[0]
        intro_ps = introduction.xpath("following-sibling::*")
        target = tree.findall("//*[@id='conformance-requirements']")[0]
        target.addprevious(introduction)
        target = introduction
        target.addnext(intro_ps[2])
        target.addnext(intro_ps[1])
        target.addnext(intro_ps[0])

        # Serialize
        tokens = html5lib.treewalkers.getTreeWalker("lxml")(tree)
        serializer = html5lib.serializer.HTMLSerializer(quote_attr_values=True, inject_meta_charset=False)
        for text in serializer.serialize(tokens, encoding="utf-8"):
            pre_anolis_buffer.write(text)

        filtered = pre_anolis_buffer

    # replace data-x with data-anolis-xref
    print "fixing xrefs"
    filtered.seek(0)

    # Parse
    builder = treebuilders.getTreeBuilder("lxml", etree)
    try:
        parser = html5lib.HTMLParser(tree=builder, namespaceHTMLElements=False)
    except TypeError:
        parser = html5lib.HTMLParser(tree=builder)
    tree = parser.parse(filtered, encoding="utf-8")

    # Move introduction above conformance requirements
    data_x = tree.findall("//*[@data-x]")
    non_alphanumeric_spaces = re.compile(r"[^a-zA-Z0-9 \-\_\/\|]+")
    for refel in data_x:
        refel.attrib["data-anolis-xref"] = refel.get("data-x")
        if refel.tag == "dfn" and not refel.get("id", False) and refel.attrib["data-anolis-xref"]:
            refel.attrib["id"] = generateID(refel.attrib["data-anolis-xref"], refel)
        del refel.attrib["data-x"]
    # utils.ids = {}

    print "indexing"
    # filtered.seek(0)
    # tree = generator.fromFile(filtered, **opts)
    generator.process(tree, **opts)
    filtered.close()

    # fixup nested dd's and dt's produced by lxml
    for dd in tree.findall("//dd/dd"):
        if list(dd) or dd.text.strip():
            dd.getparent().addnext(dd)
        else:
            dd.getparent().remove(dd)
    for dt in tree.findall("//dt/dt"):
        if list(dt) or dt.text.strip():
            dt.getparent().addnext(dt)
        else:
            dt.getparent().remove(dt)

    # remove unused references
    print "processing references"
    for dt in tree.findall("//dt[@id]"):
        refID = dt.get("id")
        if refID.startswith("refs") and len(tree.findall("//a[@href='#%s']" % refID)) == 0:
            next = dt.getnext()
            while next.tag != "dd":
                next = next.getnext()
            dt.getparent().remove(next)
            dt.getparent().remove(dt)
        elif refID.startswith("refs"):
            dd = dt.getnext()
            while dd.tag != "dd":
                dd = dd.getnext()
            links = dd.findall(".//a[@href]")
            for link in links:
                if link is not None:
                    wrap = link.getparent()
                    link.tail = " (URL: "
                    idx = wrap.index(link)
                    url = etree.Element("a", href=link.get("href"))
                    url.text = link.get("href")
                    wrap.insert(idx + 1, url)
                    url.tail = ")"

    if spec == "microdata":
        print "munging (after anolis)"
        # get the h3 for the misplaced section (it has no container)
        section = tree.xpath("//h3[@id = 'htmlpropertiescollection']")[0]
        # then get all of its following siblings that have the h2 for the next section as
        # a following sibling themselves. Yeah, XPath doesn't suck.
        section_content = section.xpath("following-sibling::*[following-sibling::h2[@id='introduction']]")
        target = tree.xpath("//h2[@id = 'converting-html-to-other-formats']")[0].getparent()
        target.addprevious(section)
        for el in section_content:
            target.addprevious(el)
        section.xpath("span")[0].text = "6.1 "
        # move the toc as well
        link = tree.xpath("//ol[@class='toc']//a[@href='#htmlpropertiescollection']")[0]
        link.xpath("span")[0].text = "6.1 "
        tree.xpath("//ol[@class='toc']/li[a[@href='#microdata-dom-api']]")[0].append(link.getparent().getparent())

    if spec == "srcset":
        print "munging (after anolis)"
        # In the WHATWG spec, srcset="" is simply an aspect of
        # HTMLImageElement and not a separate feature. In order to keep
        # the HTML WG's srcset="" spec organized, we have to move some
        # things around in the final document.

        # Move "The srcset IDL attribute must reflect..."
        reflect_the_content_attribute = tree.findall("//div[@class='impl']")[0]
        target = tree.find("//div[@class='note']")
        target.addprevious(reflect_the_content_attribute)

        # Move "The IDL attribute complete must return true..."
        note_about_complete = tree.findall("//p[@class='note']")[4]
        p_otherwise = note_about_complete.xpath("preceding-sibling::p[position()=1]")[0]
        ul_conditions = p_otherwise.xpath("preceding-sibling::ul[position()=1]")[0]
        p_start = ul_conditions.xpath("preceding-sibling::p[position()=1]")[0]
        target.addnext(note_about_complete)
        target.addnext(p_otherwise)
        target.addnext(ul_conditions)
        target.addnext(p_start)

    try:
        os.makedirs(spec_dir)
    except:
        pass

    if spec == "html":
        print "cleaning"
        from glob import glob

        for name in glob("%s/*.html" % spec_dir):
            os.remove(name)

        output = StringIO()
    else:
        output = open("%s/Overview.html" % spec_dir, "wb")

    generator.toFile(tree, output, **opts)

    if spec != "html":
        output.close()
    else:
        value = output.getvalue()
        if "<!--INTERFACES-->\n" in value:
            print "interfaces"
            from interface_index import interface_index

            output.seek(0)
            index = StringIO()
            interface_index(output, index)
            value = value.replace("<!--INTERFACES-->\n", index.getvalue(), 1)
            index.close()
        output = open("%s/single-page.html" % spec_dir, "wb")
        output.write(value)
        output.close()
        value = ""

        print "splitting"
        import spec_splitter

        spec_splitter.w3c = True
        spec_splitter.no_split_exceptions = conf.get("no_split_exceptions", False)
        spec_splitter.minimal_split_exceptions = conf.get("minimal_split_exceptions", False)
        spec_splitter.main("%s/single-page.html" % spec_dir, spec_dir)

        print "entities"
        entities = open(os.path.join(cur_dir, "boilerplate/entities.inc"))
        json = open("%s/entities.json" % spec_dir, "w")
        from entity_processor_json import entity_processor_json

        entity_processor_json(entities, json)
        entities.close()
        json.close()

    # copying dependencies
    def copy_dependencies(targets):
        import types

        if not isinstance(targets, types.ListType):
            targets = [targets]
        if os.name == "nt":
            for target in targets:
                os.system("xcopy /s %s %s" % (os.path.join(conf["path"], target), spec_dir))
        else:
            for target in targets:
                os.system("/bin/csh -i -c '/bin/cp -R %s %s'" % (os.path.join(conf["path"], target), spec_dir))

    print "copying"
    if spec == "html":
        if os.name == "nt":
            dirs = ["images", "fonts", "404", "switcher", "js"]
        else:
            dirs = ["images", "fonts", "404/*", "switcher", "js"]
        copy_dependencies(dirs)
    elif spec == "2dcontext":
        copy_dependencies(["images", "fonts"])
    else:
        copy_dependencies("fonts")

    # fix the styling of the 404
    if spec == "html":
        link = tree.xpath("//link[starts-with(@href, 'http://www.w3.org/StyleSheets/TR/')]")[0].get("href")
        path = os.path.join(spec_dir, "404.html")
        with open(path) as data:
            html404 = data.read()
        html404 = html404.replace("http://www.w3.org/StyleSheets/TR/W3C-ED", link)
        with open(path, "w") as data:
            data.write(html404)