def getFromDB(blog_id, comment_id=None, conn = sqlite3.connect(DB_FILE)):
        '''
            Get a Blog object from the database.

            Parameters
            ====================================

            blog_id     `int|list[int]` - The blog_id to retrieve.
            comment_id  `int` - The comment_id to retrieve.
            conn    `sqlite3.Connection` - A SQLite connection object. Defaults to a new connection to the global DB_FILE database file.

            Returns
            ====================================

            `Comment|list(Comment)`  - a Comment object; a list of Comments is returned if no comment_id is given.
        '''
        cur = conn.cursor()
        blogClause = ("blog_id IN (" + ",".join(str(b) for b in blog_id) + ")") if isinstance(blog_id, list) else ("blog_id = " + str(blog_id))
        if comment_id is None:
            cur.execute("SELECT * FROM comments WHERE " + blogClause)
            commentRows = cur.fetchall()
            return [Comment(*c) for c in commentRows]
        else:
            cur.execute("SELECT * FROM comments WHERE " + blogClause + " AND comment_id = " + str(comment_id))
            commentRow = cur.fetchone()
            return Comment(*commentRow)
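A minimal usage sketch, assuming getFromDB is exposed as a static helper on the Comment class (as it is called elsewhere in this module) and that DB_FILE points at an existing SQLite database; the blog and comment ids here are made up:

conn = sqlite3.connect(DB_FILE)
one_comment = Comment.getFromDB(42, comment_id=7, conn=conn)    # a single Comment
blog_comments = Comment.getFromDB([42, 43, 44], conn=conn)      # a list of Comment objects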
Example #2
def latexify(soup):
    for item in tagspec:
        for tag in soup.find_all(name=item[0]):
            if item[1]:
                tag.insert_before(Comment(item[1]))
            if item[2]:
                tag.insert_after(Comment(item[2]))
Example #3
def add_image_map(tag, siz):
    map_tag = tag.find_previous_sibling('map')
    if not map_tag:
        return tag

    #Mostly the scale is half width, as we have two columns.
    scale = 0.5
    #Sometimes that does not fit, so we keep reducing.
    while (siz[0] * scale) > 280:
        scale *= 0.8
    #Some images are in one column sections, so twice the size
    if tag.find_parent("div", class_="full-width"):
        print("Found a BIG image")
        scale *= 2
    #image may need to be on a new page.
    preamble = '\n\n\\par\\Needspace{' + str(
        30 + siz[1] * scale) + 'pt}\\begin{picture}(' + str(
            siz[0] * scale) + ',' + str(siz[1] * scale) + ')\n'
    postamble = ''
    for area in map_tag.find_all(name='area'):
        if area.has_attr('coords') and area.has_attr('href') and area.has_attr(
                'shape'):
            if (area['shape'] == 'rect'):
                label = label_of_ref(area['href'])
                coords = area['coords'].split(',')
                #Calculations with w and h are because the HTML hotspots have
                #y down the page, and LaTeX ones go up the page.
                #And we have to play games with strings, ints and scaling.
                coords = [int(x) * scale
                          for x in coords]  # convert to numbers.
                # print( 'coord ',coords )
                x, y, x1, y1 = coords
                w = x1 - x
                y = siz[1] * scale - y
                y1 = siz[1] * scale - y1
                h = y - y1
                # Do not include the rather wide hotspots.
                # Each rectangle is 'put' into the picture.
                if (w < (520 * scale)):
                    x, y1, w, h = [str(k) for k in [x, y1, w, h]]
                    postamble += '   \\put(' + x + ',' + y1 + '){\\hyperref[\\foo{' + label + '}]{\\makebox(' + w + ',' + h + '){}}}\n'
    postamble += '\\end{picture}\n\n'
    tag.insert_before(Comment(preamble))
    tag.insert_after(Comment(postamble))
    #The image itself, using put, is before the puts for the areas.
    tag.insert_before(
        Comment('   \\put(0,0){\\includegraphics[scale=' + str(scale) + ']{'))
    tag.insert_after(Comment('}}\n'))
    return tag
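For orientation, a rough sketch of the LaTeX this emits for a hypothetical 400x200 px image at scale 0.5 with one rectangular hotspot; the label and coordinates are made up, and the file name between the includegraphics braces is filled in elsewhere:

\par\Needspace{130pt}\begin{picture}(200.0,100.0)
   \put(0,0){\includegraphics[scale=0.5]{<image file>}}
   \put(30.0,30.0){\hyperref[\foo{sec:example}]{\makebox(80.0,20.0){}}}
\end{picture}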
def patch(filepath):
    if ("php" in filepath):
        patch_php(filepath)
        return 0
    try:
        with open(filepath) as inf:
            txt = inf.read()
            # soup = BeautifulSoup(txt, 'html.parser')
            soup = BeautifulSoup(txt, "html5lib")
        mydiv = soup.head.find('script', {'class': 'gtm'})
        if not mydiv:
            scrTag = Tag(soup, name='script')
            scrTag['class'] = "gtm"
            scrTag.string = headSnippet
            soup.head.insert(0, Comment('End Google Tag Manager'))
            soup.head.insert(0, scrTag)
            soup.head.insert(0, Comment('Google Tag Manager'))
            #scrTag.insert_before(Comment('Google Tag Manager'))
            #scrTag.insert_after(Comment('End Google Tag Manager'))

            # insert body snippet into the document
            iframeTag = Tag(soup, name='iframe')
            iframeTag['src'] = iframeSrc
            iframeTag['height'] = "0"
            iframeTag['width'] = "0"
            iframeTag['style'] = "display:none;visibility:hidden"

            noscrTag = Tag(soup, name='noscript')
            noscrTag['class'] = 'gtm'
            noscrTag.insert(0, iframeTag)
            soup.body.insert(0, Comment('End Google Tag Manager (noscript)'))
            soup.body.insert(0, noscrTag)
            soup.body.insert(0, Comment('Google Tag Manager (noscript)'))
            #noscrTag.insert_before(Comment('Google Tag Manager (noscript)'))
            #noscrTag.insert_after(Comment('End Google Tag Manager (noscript)'))

        # save the file again
        with open(filepath, 'w') as outf:
            outf.write(str(soup))

    except IOError as e:
        print "I/O error({0}): {1}".format(e.errno, e.strerror)
        return -1
    except:
        print "Unexpected error:", sys.exc_info()[0]
        return -2
    print "Analytics Patched Successfully"
    return 0
Example #5
def createJSONfile(url, soupObject):
    '''
    Creates a JSON file and writes the string from the soupObject into it.
    '''

    my_title = getTitle(soupObject)
    my_time = getTime()

    #Adding comments to the BeautifulSoup object.
    tag = soupObject.html
    new_comment = Comment('\nWebpage title: ' + my_title + ';\n' +
                          'Webpage extracted from: ' + url + ';\n' +
                          'Webpage time extraction: ' + my_time + ';\n\n')
    tag.insert_before(new_comment)

    #Make the soup object readable.
    p_soup_html = soupObject.prettify()

    #Convert into a JSON.
    y = json.dumps(p_soup_html)

    with open('%s.json' % my_title, 'w', encoding='utf-8') as file:
        file.writelines(y)
    def getOtherComments(self, retreiveCount = 5, blog_ids = None):
        '''
            Get a list of comments not in this blog.

            Parameters
            ====================================

            retreiveCount   `int` - The topmost k comments; a small integer (below 30) is recommended.
            blog_ids  `list[int]` - A pre-fetched blog_id list

            Returns
            ====================================

            `list(Comment)`  - a list of Comment objects.
        '''
        cur = self.conn.cursor()
        if blog_ids is None:
            blog_ids = Blog.getIDs(self.conn)
        
        blog_ids = random.sample(blog_ids, retreiveCount+1)

        if (self.blog_id in blog_ids):
            del blog_ids[blog_ids.index(self.blog_id)]

        cur.execute("SELECT * FROM comments WHERE blog_id IN (" + ",".join([str(bi) for bi in blog_ids]) + ") ORDER BY RANDOM() LIMIT " + str(retreiveCount))
        comments = cur.fetchall()
        return [Comment(*c) for c in comments]
    def getCommentsFromSimilarBlogs(self, conn2 = sqlite3.connect(DB_FILE2), topK = 10, retreiveCount = 5, orderedBy = "random", cachedWordList = None, logKeywords = False, printBlogTitles=False):
        '''
            Get a list of comments from blogs with similar tf-idf to this blog.

            Parameters
            ====================================

            conn2  `sqlite3.Connection` - A SQLite connection object for the word dictionary. Defaults to a new connection to the global DB_FILE2 database file.
            topK            `int`       - The top-K tf-idf words to be selected for comparisons.
            retreiveCount   `None|int` - The topmost k comments; all if None is given.

            Returns
            ====================================

            `list(Comment)`  - a list of Comment objects.

        '''
        cur = self.conn.cursor()

        # Select a list of comments.
        similarBlogs = self.getSimilarBlogs(conn2, topK, retreiveCount, orderedBy=orderedBy, logKeywords=logKeywords, cachedWordList=cachedWordList)

        if (printBlogTitles):
            print([str(b.title) for b in similarBlogs])
        cur.execute("SELECT * FROM comments WHERE blog_id IN (" + ",".join([str(b.blog_id) for b in similarBlogs]) + ") ORDER BY RANDOM()" + ("" if retreiveCount is None else (" LIMIT " + str(retreiveCount))))
        return [Comment(*c) for c in cur.fetchall()]
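A hypothetical call, assuming blog is an instance of the blog class these methods belong to:

similar_comments = blog.getCommentsFromSimilarBlogs(topK=10, retreiveCount=5, printBlogTitles=True)
for c in similar_comments:
    print(c.body)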
Example #8
    def response(self, flow: http.HTTPFlow):
        response = flow.response
        if CONTENT_TYPE in response.headers:
            if any(
                    map(lambda t: t in response.headers[CONTENT_TYPE],
                        RELEVANT_CONTENT_TYPES)):
                # Response is a web page; proceed.
                insertedScripts: List[str] = []
                soup = BeautifulSoup(response.content,
                                     HTML_PARSER,
                                     from_encoding=inferEncoding(response))
                requestURL = flow.request.pretty_url  # should work in transparent mode too, unless the Host header is spoofed
                isApplicable: Callable[[Userscript],
                                       bool] = userscript.applicableChecker(
                                           requestURL)
                for script in self.userscripts:
                    if isApplicable(script):
                        useInline = ctx.options.inline or script.downloadURL is None
                        if useInline and len(script.unsafeSequences) > 0:
                            logError(unsafeSequencesMessage(script))
                            continue
                        logInfo(
                            f"""Injecting {script.name}{"" if script.version is None else " " + VERSION_PREFIX + script.version} into {requestURL} ({"inline" if useInline else "linked"}) ..."""
                        )
                        result = inject(
                            script, soup,
                            Options(
                                inline=ctx.options.inline,
                                verbose=ctx.options.verbose,
                            ))
                        if type(result) is BeautifulSoup:
                            soup = result
                            insertedScripts.append(script.name + (
                                "" if script.version is None else " " +
                                stringifyVersion(script.version)))
                        else:
                            logError(
                                "Injection failed due to the following error:")
                            logError(str(result))

                index_DTD: Optional[int] = indexOfDTD(soup)
                # Insert information comment:
                if ctx.options.verbose:
                    soup.insert(
                        0 if index_DTD is None else 1 + index_DTD,
                        Comment(INFO_COMMENT_PREFIX +
                                ("No matching userscripts for this URL."
                                 if insertedScripts ==
                                 [] else "These scripts were inserted:\n" +
                                 bulletList(insertedScripts)) + "\n"))
                # Prevent BS/html.parser from emitting `<!DOCTYPE doctype html>` or similar if "DOCTYPE" is not all uppercase in source HTML:
                if index_DTD is not None and REGEX_DOCTYPE.match(
                        soup.contents[index_DTD]):
                    # There is a DTD and it is invalid, so replace it.
                    soup.contents[index_DTD] = Doctype(
                        re.sub(REGEX_DOCTYPE, "", soup.contents[index_DTD]))
                # Serialize and encode:
                response.content = str(soup).encode(
                    fromOptional(soup.original_encoding, CHARSET_DEFAULT),
                    "replace")
Example #9
    def response(self, flow: http.HTTPFlow):
        response = flow.response
        if CONTENT_TYPE in response.headers:
            if any(
                    map(lambda t: t in response.headers[CONTENT_TYPE],
                        RELEVANT_CONTENT_TYPES)):
                # Response is a web page; proceed.
                insertedScripts: List[str] = []
                soup = BeautifulSoup(response.content,
                                     HTML_PARSER,
                                     from_encoding=inferEncoding(response))
                requestURL = flow.request.pretty_url  # should work in transparent mode too, unless the Host header is spoofed
                if requestContainsQueryParam(
                        option(T.option_query_param_to_disable), flow.request):
                    logInfo(
                        f"""Not injecting any userscripts into {requestURL} because it contains a `{option(T.option_query_param_to_disable)}` query parameter."""
                    )
                    return
                isApplicable: Callable[[Userscript],
                                       bool] = userscript.applicableChecker(
                                           requestURL)
                for script in self.userscripts:
                    if isApplicable(script):
                        useInline = option(
                            T.option_inline) or script.downloadURL is None
                        if useInline and len(script.unsafeSequences) > 0:
                            logError(unsafeSequencesMessage(script))
                            continue
                        logInfo(
                            f"""Injecting {script.name}{"" if script.version is None else " " + VERSION_PREFIX + script.version} into {requestURL} ({"inline" if useInline else "linked"}) ..."""
                        )
                        result = inject(
                            script, soup,
                            Options(inline=option(T.option_inline), ))
                        if type(result) is BeautifulSoup:
                            soup = result
                            insertedScripts.append(script.name + (
                                "" if script.version is None else " " +
                                T.stringifyVersion(script.version)))
                        else:
                            logError(
                                "Injection failed due to the following error:")
                            logError(str(result))

                index_DTD: Optional[int] = indexOfDTD(soup)
                # Insert information comment:
                if option(T.option_list_injected):
                    soup.insert(
                        0 if index_DTD is None else 1 + index_DTD,
                        Comment(HTML_INFO_COMMENT_PREFIX +
                                ("No matching userscripts for this URL."
                                 if insertedScripts ==
                                 [] else "These scripts were inserted:\n" +
                                 bulletList(insertedScripts)) + "\n"))
                # Serialize and encode:
                response.content = str(soup).encode(
                    fromOptional(soup.original_encoding, CHARSET_DEFAULT),
                    "replace")
Example #10
    def as_html(self, inline=False):
        div = Tag(name='div')
        if inline:
            div.append(Comment(str(self)))
        else:
            p = Tag(name='p')
            p.append('Location not known more precisely.')
            div.append(p)
        return div
Example #11
    def _tweakHTML(self, soup, manifest, swJS):
        #TODO: adding a DOCTYPE seems to mess with the finished game's layout, a browser issue, quirks mode?...
        #prefix with <!DOCTYPE html>...
        #doctype = Doctype('html')
        #soup.insert(0, doctype)


        #tweak head...
        head = soup.head

        comment = Comment("This file has been modified by pwap8 (https://github.com/loxodromic/pwap8)")
        head.insert(0, comment)

        #add some meta tags for colours, icons, etc...
        head.append(soup.new_tag('meta', attrs={'name': 'theme-color', 'content': '#cccccc'}))
        head.append(soup.new_tag('meta', attrs={'name': 'apple-mobile-web-app-capable', 'content': 'yes'}))
        head.append(soup.new_tag('meta', attrs={'name': 'apple-mobile-web-app-status-bar-style', 'content':'#222222'}))
        head.append(soup.new_tag('meta', attrs={'name': 'apple-mobile-web-app-title', 'content':soup.title.string}))
        head.append(soup.new_tag('meta', attrs={'name': 'msapplication-TileImage', 'content':"images/{name}-icon-144.png".format(name=self.projectNameShort)}))
        head.append(soup.new_tag('meta', attrs={'name': 'msapplication-TileColor', 'content':'#cccccc'}))


        #favicons...
        head.append(soup.new_tag('link', attrs={'rel': 'apple-touch-icon', 'href': "images/{name}-icon-167.png.png".format(name=self.projectNameShort)}))

        if self.faviconStyle == "png":
            head.append(soup.new_tag('link', attrs={'rel':'icon', 'href':'favicon-32.png', 'type':'image/png'}))
        elif self.faviconStyle == "ico":
            head.append(soup.new_tag('link', attrs={'rel':'icon', 'href':'favicon.ico', 'type':'image/x-icon'}))


        #manifest...
        if self.bInlineManifest:
            manifestStr = json.dumps(manifest, indent=4, sort_keys=False)
            head.append(soup.new_tag('link', attrs={'rel':'manifest', 'href':'data:application/manifest+json,' + manifestStr}))
        else:
            head.append(soup.new_tag('link', attrs={'rel':'manifest', 'href':"{name}.manifest".format(name=self.projectNameShort)}))


        #tweak body...
        body = soup.body

        #something for when JavaScript is off...
        fallbackContent = soup.new_tag("noscript")
        fallbackContent.string = "This will be much more fun with JavaScript enabled."
        body.append(fallbackContent)


        #service worker...
        #TODO: can we inline the service worker?...
        startSW = soup.new_tag("script", attrs={'type':'text/javascript'})
        startSW.string = "window.onload = () => { 'use strict'; if ('serviceWorker' in navigator) { navigator.serviceWorker.register('./sw.js');}}"
        body.append(startSW)
Example #12
def inject_live_server_script(path):
    with open(path) as fp:
        soup = BeautifulSoup(fp, features='html.parser')
        head = soup.find('head')
        if head is None:
            head_tag = soup.new_tag('head')
            soup.append(head_tag)
            head = soup.find('head')
        live_server_script_tag = soup.new_tag(name='script',
                                              attrs={'src': '/liveServer.js'})
        head.append(Comment('injected by live-server'))
        head.append(live_server_script_tag)
        b_soup = soup.encode()
        return b_soup
Example #13
def inject_live_server_script(path):
    try:
        with open(path) as fp:  # TODO use passed path
            soup = BeautifulSoup(fp, features='html.parser')
            head = soup.find('head')
            if head is None:
                head_tag = soup.new_tag('head')
                soup.append(head_tag)
                head = soup.find('head')
            live_server_script_tag = soup.new_tag(
                name='script', attrs={'src': '/liveServer.js'})
            head.append(Comment('injected by live-server'))
            head.append(live_server_script_tag)
            b_soup = soup.encode()
            return b_soup
    except FileNotFoundError:
        # TODO throw or send 404
        return "noo"
    def getThisComments(self, retreiveCount = None):
        '''
            Get a list of comments under this blog.

            Parameters
            ====================================

            retreiveCount   `None|int` - The topmost k comments, all if None is given.

            Returns
            ====================================

            `list(Comment)`  - a list of Comment objects.
        '''
        cur = self.conn.cursor()
        cur.execute("SELECT * FROM comments WHERE blog_id = " + str(self.blog_id) + ((" ORDER BY RANDOM() LIMIT " + str(retreiveCount)) if retreiveCount is not None else ""))
        comments = cur.fetchall()
        return [Comment(*c) for c in comments]
    def getCommentsFromSimilarTags(self, retreiveCount = 5):
        '''
            Get a list of comments from blogs with the same tags as this blog.

            Parameters
            ====================================

            retreiveCount   `None|int` - The topmost k comments, all if None is given.

            Returns
            ====================================

            `list(Comment)`  - a list of Comment objects.

        '''
        cur = self.conn.cursor()

        # Select a list of comments.
        cur.execute("SELECT * FROM comments WHERE blog_id IN (" + ",".join([str(b.blog_id) for b in self.getSimilarBlogsByTags(retreiveCount)]) + ") ORDER BY RANDOM()" + ("" if retreiveCount is None else (" LIMIT " + str(retreiveCount))))
        return [Comment(*c) for c in cur.fetchall()]
    def getCommentsFromSimilarTFIDF(self, conn2 = sqlite3.connect(DB_FILE2), topK = 10, retreiveCount = 5):
        '''
            Get a list of comments from blogs with similar tf-idf to this blog.

            Parameters
            ====================================

            conn2  `sqlite3.Connection` - A SQLite connection object for the word dictionary. Defaults to a new connection to the global DB_FILE2 database file.
            topK            `int`       - The top-K tf-idf words to be selected for comparisons.
            retreiveCount   `None|int` - The topmost k comments, all if None is given.

            Returns
            ====================================

            `list(Comment)`  - a list of Comment objects.

        '''
        cur = self.conn.cursor()

        # Select a list of comments.
        cur.execute("SELECT * FROM comments WHERE blog_id IN (" + ",".join([str(b.blog_id) for b in self.getSimilarBlogsByTFIDF(conn2, topK, retreiveCount)]) + ") ORDER BY RANDOM()" + ("" if retreiveCount is None else (" LIMIT " + str(retreiveCount))))
        return [Comment(*c) for c in cur.fetchall()]
from bs4 import BeautifulSoup, Comment
import requests

soup = BeautifulSoup('<b><!--Yo soy un comentario HTML--></b>', 'html.parser')
print(soup.b.string)
print(type(soup.b.string))

comment = Comment(' #mycomment ')
soup.b.string.replace_with(comment)
print(soup.b)
print(type(soup.b.string))
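For reference, with a typical bs4 install this snippet prints roughly:

Yo soy un comentario HTML
<class 'bs4.element.Comment'>
<b><!-- #mycomment --></b>
<class 'bs4.element.Comment'>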
Example #18
for j in range((len(singlewords) - 1) // wordsperquery + 1):
    words = singlewords[j * wordsperquery:(j + 1) * wordsperquery]
    if len(words) > 1:
        searches.append('("' + '" OR "'.join(words) + '")')
    else:
        searches.append('"' + words[0] + '"')
for words in multwords:
    searches.append('(' + ' AND '.join(words) + ')')
startyear = str(now.year - 1)
stopyear = str(now.year + 1)
startsearch = int(sys.argv[1])
stopsearch = int(sys.argv[2])
rpp = 30
chunksize = 50
publisher = 'oatd.org'
comment = Comment('Kommentar')
#check already harvested
ejldirs = [
    '/afs/desy.de/user/l/library/dok/ejl/onhold',
    '/afs/desy.de/user/l/library/dok/ejl',
    '/afs/desy.de/user/l/library/dok/ejl/zu_punkten',
    '/afs/desy.de/user/l/library/dok/ejl/zu_punkten/enriched',
    '/afs/desy.de/user/l/library/dok/ejl/backup',
    '/afs/desy.de/user/l/library/dok/ejl/backup/%i' % (now.year - 1)
]
redoki = re.compile('THESES.OATD')
renodoi = re.compile(r'^I\-\-NODOI:(.*)\-\-$')
bereitsin = []
for ejldir in ejldirs:
    print(ejldir)
    for datei in os.listdir(ejldir):
Example #19
def main(qhp_file):
    """TODO: Docstring for main.
  :returns: TODO

  """
    src_file = open(qhp_file)
    dst_file = open(
        os.path.dirname(src_file.name) + os.path.sep + 'index-fix.qhp', 'w')
    soup = BeautifulSoup(src_file, 'xml')
    keywords = soup.findAll('keyword')
    for keyword in keywords:
        kid = keyword['id']
        m = re.match(r'lav.*::(.*)$', kid)
        if m:
            keyword['id'] = m.group(1)
        pass

    # DO NOT use soup.prettify
    # qhelpgenerator CAN NOT recognize the format
    dst_file.write(str(soup))

    # see https://github.com/mmmarcos/doxygen2qtcreator/blob/master/doxygen2qtcreator.py
    # popup tooltips for function call MUST have the format
    # <!-- $$$function_name[overload1]$$$ -->
    # <div class='memdoc'>
    # <p>Only the first p tag can be show in popup tooltips/hover documentation</p>
    # <!-- @@@function_name -->
    # ...
    # ...
    # </div>
    files = soup.find_all('file')
    common_dir = os.path.dirname(src_file.name)
    for f in files:
        html_file = open(common_dir + os.path.sep + f.text, 'r+')
        if html_file:
            html_soup = BeautifulSoup(html_file, 'html.parser')
            memitems = html_soup.find_all('div', {'class': 'memitem'})
            should_write_back_to_file = False
            if memitems:
                for item in memitems:
                    memname = item.find('td', {'class': 'memname'})
                    memdoc = item.find('div', {'class': 'memdoc'})
                    if memdoc and memname:
                        html_text = memname.get_text()
                        names = html_text.strip(' ').split(' ')
                        # Only handle function call name
                        # ffmpeg av_xxxxx
                        # int function_call_name
                        if len(names) == 2 and names[1].startswith('av'):
                            # TODO:merge multiple <p> tags in memdoc
                            # QtCreator only pick the first <p> tag to display in the tooltips
                            marker_start = u' $$${0}[overload1]$$$ '.format(
                                names[1])
                            marker_end = u' @@@{0} '.format(names[1])
                            memdoc.insert_before(Comment(marker_start))
                            memdoc.insert_after(Comment(marker_end))
                            should_write_back_to_file = True
                    pass
        if should_write_back_to_file:
            print('insert QtCreator style marker for %s' % html_file.name)
            html_file.seek(0)
            # DO NOT prettify
            # for the code in the html, unprettified text is more readable
            html_file.write(str(html_soup))
        html_file.close()
        pass

    src_file.close()
    dst_file.close()

    print('Done, /path/to/qhelpgenerator %s -o index.qch' % dst_file.name)
    print('Then, attach index.qch file to your QtCreator')
    print('Tool -> Options -> Help -> Documentation -> Add')
    pass
Example #20
    def note_error(self, elem, message, strip):
        elem.append(Comment(message))
        if strip:
            elem.parent().clear()
Example #21
root = os.getcwd()
target_path = root+"/images"


file = open("index.html","r")
webpage = file.read()
soup = BeautifulSoup(webpage,'html.parser')
get_target_div = soup.find('div',{'id':'lightgallery'})
photo_list = [photos['href'].replace('images/','').lower() for photos in get_target_div.find_all('a')]
# print(photo_list)
pointer = soup.find('div', {'id': 'lightgallery'})
for dirName, subdirList, fileList in os.walk(target_path, topdown=False):
    rel_dir = os.path.relpath(dirName, target_path)
    if rel_dir.startswith('.'):
        continue
    comment_tag = Comment(" "+rel_dir.upper()+" ")
    pointer.append(comment_tag)
    print('=== %s ===' % comment_tag)
    for fname in fileList:

        if fname.startswith('.'):
            continue
        if "thumb-" in fname:
            continue
        if fname.lower() not in photo_list:
            new_soup = BeautifulSoup("", "html.parser")
            new_tag = new_soup.new_tag("a",attrs={'class':"grid-item", 'href':"images/"+rel_dir+"/"+fname})
            new_soup.append(new_tag)

            new_tag = new_soup.new_tag("img", attrs={'src':"images/"+rel_dir+"/thumb-"+fname})
            new_soup.a.append(new_tag)
Example #22
def replacement(cursor,
                wp_posts,
                shorten_url_dict,
                shorten_url_keys,
                cat_dict,
                post_tag_dict,
                imported_idd={}):
    features = get_features(cursor)
    feature_ids = [f['term_id'] for f in features]
    wp_post_lists = []
    wp_post_dict = {}
    h = HTMLParser()
    for wp_post in wp_posts:
        # extract wplink and remove all [wplink ...] in content.
        matches = re.findall(r'(\[wplink name="(.*)"\])',
                             wp_post['post_content'])
        short_link_dict = {}
        short_links = []
        for i in matches:
            full, part = i
            if part in shorten_url_keys:
                short_links.append(part)

        if len(short_links) > 0:
            z = hashlib.md5(''.join(sorted(short_links)).encode('utf-8')).hexdigest()
            x = {}
            for short_link in short_links:
                x[short_link] = [
                    shorten_url_dict[short_link]['link'],
                    shorten_url_dict[short_link]['title']
                ]

            wp_post['wplink'] = [z, x]

        # fix newline at <span> & オススメ記事 (recommended articles)
        wp_post['post_content'] = wp_post['post_content'].replace(
            '\r\n<span', '\r\n\r\n<span')

        # add more 1 newline
        add_newline_lists = [
            '</h1>', '</h2>', '</h3>', '</h4>', '</h5>', '</table>', '</p>',
            '</blockquote>', '</ul>', '</ol>'
        ]
        for add_newline_list in add_newline_lists:
            wp_post['post_content'] = wp_post['post_content'].replace(
                add_newline_list, add_newline_list + "\r\n")

        # add <br> if needed
        lists_without_br = [
            '<table', '<thead', '<tbody', '<td', '<th', '<tr', '</table>',
            '</thead>', '</tbody>', '</td>', '</th>', '</tr>', '<p>', '</p>',
            '</li>'
        ]
        ts = wp_post['post_content'].split('\r\n\r\n')
        for i, v in enumerate(ts):
            t = ts[i].strip()
            need_replace = True
            for lwb in lists_without_br:
                if t.find(lwb) != -1:
                    need_replace = False
                    break

            if need_replace:
                ts[i] = ts[i].replace('\r\n', '<br>\r\n')

        wp_post['post_content'] = '\r\n\r\n'.join(ts)

        # remove width & height attribute
        wp_post['post_content'] = re.sub(r'(.*) width="\d+"(.*)', r'\1\2',
                                         wp_post['post_content'])
        wp_post['post_content'] = re.sub(r'(.*) height="\d+"(.*)', r'\1\2',
                                         wp_post['post_content'])

        # replace [caption] to html format
        wp_post['post_content'] = re.sub(
            r'\[caption(.*)\](.*>)(.*)\[\/caption\]',
            r'<div class="media">\2<div class="caption">\3</div></div>',
            wp_post['post_content'])

        # remove [nextpage]
        #wp_post['post_content'] = re.sub(r'\[\/nextpage\]', '', wp_post['post_content'])
        #wp_post['post_content'] = re.sub(r'\[nextpage(.*)\]', '', wp_post['post_content'])

        pid = wp_post['ID']
        wp_post_dict[pid] = wp_post
        wp_post_dict[pid]['meta'] = {}
        wp_post_dict[pid]['related_article_ids'] = []
        wp_post_dict[pid]['related_article_titles'] = []

        wp_postmeta_result = get_wp_metadata_by_post_id(cursor, pid)
        for wp_postmeta in wp_postmeta_result:
            wp_post_dict[wp_postmeta['post_id']]['meta'][
                wp_postmeta['meta_key']] = wp_postmeta['meta_value']

            if wp_postmeta['meta_key'] == 'simple_related_posts':
                # convert related_articles
                ra_ids = sorted(list(
                    set(
                        map(
                            int,
                            re.findall(
                                r'"(\d+)"',
                                wp_post_dict[wp_postmeta['post_id']]['meta']
                                ['simple_related_posts'])))),
                                reverse=True)
                ra_ids = [
                    ra_id for ra_id in ra_ids if
                    not check_if_fisco(cursor, ra_id) and ra_id in imported_idd
                ]
                wp_post_dict[
                    wp_postmeta['post_id']]['related_article_ids'] = ra_ids
                # XXX: set default title
                wp_post_dict[
                    wp_postmeta['post_id']]['related_article_titles'] = [
                        'x' for _ in ra_ids
                    ]
                del wp_post_dict[wp_postmeta['post_id']]['meta'][
                    wp_postmeta['meta_key']]

    for k in wp_post_dict:
        _wp_post = wp_post_dict[k]

        # fix html_content. change double newline into <p> tag.
        sps = _wp_post['post_content'].split('\r\n\r\n')
        for idx, val in enumerate(sps):
            if sps[idx][:3] != '<p>':
                sps[idx] = '<p>{}</p>'.format(val)
        _wp_post['post_content'] = '\r\n'.join(sps)

        # insert <br> after some tags.
        _wp_post['post_content'] = re.sub(r'</strong>\r\n',
                                          '</strong><br>\r\n',
                                          _wp_post['post_content'])
        _wp_post['post_content'] = re.sub(r'</a>\r\n', '</a><br>\r\n',
                                          _wp_post['post_content'])
        _wp_post['post_content'] = re.sub(r'<p>【(.*)オススメ(.*)】\r\n',
                                          r'<p>【\g<1>オススメ\g<2>】<br>\r\n',
                                          _wp_post['post_content'])

        # create soup
        post_content_soup = BeautifulSoup(_wp_post['post_content'], "lxml")
        # remove class,id,name and style in html.
        for tag in post_content_soup():
            if isinstance(tag, Tag):
                for attribute in ["class", "id", "name", "style"]:
                    if tag.name == 'div' and 'class' in tag.attrs and (
                            'media' in tag.attrs['class']
                            or 'caption' in tag.attrs['class']):
                        continue
                    del tag[attribute]

        # fix html_content. wrap NavigableString into a <p> tag.
        for k, v in enumerate(post_content_soup.body.findAll(recursive=False)):
            if isinstance(v, NavigableString):
                new_p_tag = post_content_soup.new_tag('p')
                if post_content_soup.body.contents[k].strip() == 'nextpage':
                    new_p_tag.append(Comment('nextpage'))
                else:
                    new_p_tag.string = str(v)

                post_content_soup.body.contents[k] = new_p_tag

        post_content_soup.html.unwrap()
        post_content_soup.body.unwrap()

        # process <blockquote>
        for match in post_content_soup.findAll('blockquote'):
            mf = match.findAll(recursive=False)
            match.contents = [m for m in match.contents if m != '\n']
            for k, v in enumerate(mf):
                if isinstance(v, Tag) and v.name != 'p' and v.name != 'br':
                    new_p_tag = post_content_soup.new_tag('p')
                    new_p_tag.string = v.text
                    match.contents[k] = new_p_tag

            if len(mf) == 0:
                new_p_tag = post_content_soup.new_tag('p')
                new_p_tag.string = match.text
                match.string = ''
                match.insert(0, new_p_tag)

        # remove span
        for match in post_content_soup.findAll('span'):
            match.replaceWithChildren()

        # remove <a> outside of <img>
        for match in post_content_soup.findAll('img'):
            if isinstance(match.parent, Tag) and match.parent.name == 'a':
                try:
                    if re.match(r'.*\.(jpg|png|gif|bmp)',
                                match.parent['href']).group():
                        match.parent.unwrap()
                except:
                    pass
            #try:
            #    new_br_tag = post_content_soup.new_tag('br')
            #    match.parent.insert(-1, new_br_tag)
            #except:
            #    pass

            #if isinstance(match.parent, Tag) and match.parent.name == 'p':
            #    match.parent['style'] = 'text-align: center;'

        # wrap div outside of table
        for v in post_content_soup.findAll('table'):
            new_div_tag = post_content_soup.new_tag('div',
                                                    **{'class': 'tableWrap'})
            contents = v.replace_with(new_div_tag)
            new_div_tag.append(contents)

        # wrap div outside of iframe which src is youtube.com/xxx
        for v in post_content_soup.findAll('iframe'):
            if v['src'] is not None and v['src'].find('www.youtube.com') != -1:
                new_div_tag = post_content_soup.new_tag(
                    'div', **{'class': 'youtube'})
                contents = v.replace_with(new_div_tag)
                new_div_tag.append(contents)

        # process <!--nextpage-->
        comments = post_content_soup.find_all(
            string=lambda text: isinstance(text, Comment))
        for comment in comments:
            if comment == 'nextpage':
                pp = comment.parent
                try:
                    ct = 1
                    pps = pp.find_previous_sibling()
                    while True:
                        if ct > 5:
                            break

                        if len(pps.findChildren('a')) > 0:
                            pps.extract()
                            break
                        else:
                            pps = pps.find_previous_sibling()
                            ct += 1

                    pp.unwrap()
                except:
                    pass

        _wp_post['post_content'] = post_content_soup.prettify(indent_width=2)

        # cleanup empty tags
        _wp_post['post_content'] = _wp_post['post_content'].replace(
            '<p>\n  <br/>\n</p>', '')
        _wp_post['post_content'] = _wp_post['post_content'].replace(
            '<p>\n</p>', '')

        # replace <a> tag which values are https://localhost.com/archives/ZZZ
        reps = []
        a_tags = post_content_soup.findAll('a')
        for a_tag in a_tags:
            try:
                matches = re.search(r'^https:\/\/localhost.com\/archives',
                                    a_tag['href'])
                if matches is not None:
                    reps.append(a_tag['href'])
            except:
                pass

        # replace absolute link into relative.
        for rep in reps:
            r = rep.split('https://localhost.com/archives')[1]
            _wp_post['post_content'] = _wp_post['post_content'].replace(
                rep, '/archives' + r)

        # XXX: fix [nextpage] format error
        if _wp_post['ID'] in fix_nextpage_dicts.keys():
            for tp in fix_nextpage_dicts[_wp_post['ID']]:
                _wp_post['post_content'] = _wp_post['post_content'].replace(
                    *tp)

        # unescape html
        _wp_post['post_content'] = h.unescape(_wp_post['post_content'])

        # trim html tags
        _content = post_content_soup.text

        # validate meta key
        for k in [
                '_aioseop_keywords', '_aioseop_description', '_aioseop_title',
                'subtitle'
        ]:
            if k not in _wp_post['meta']:
                _wp_post['meta'][k] = ''

        _wp_post['post_content'] = _wp_post['post_content'].replace(
            'https://localhost.com/wp-content/uploads/',
            'https://stg.localhost/640/480/uploads/')

        _post = {
            'id': _wp_post['ID'],
            'operator_id': 0,  # TODO:
            'author_id': _wp_post['post_author'],
            'editor_id': 1,
            'category_id': 0,
            'image_id': 1,
            'company_id': 0,
            'title': _wp_post['post_title'],
            'content': _content,
            'lead_content': _content[:140],
            'html_content': _wp_post['post_content'],
            'sub_title': _wp_post['meta']['subtitle'],
            'meta_description': _wp_post['meta']['_aioseop_description'],
            'meta_keywords': _wp_post['meta']['_aioseop_keywords'],
            'meta_title': _wp_post['meta']['_aioseop_title'],
            'noindex_flg': False,
            'nofollow_flg': False,
            'nolist_flg': False,
            'ogp_image_config': 1,
            'twitter_card': 2,
            'amp_flg': False,
            'instant_articles_flg': False,
            'status': 1,
            'trash_flg': False,
            'created_at': _wp_post['post_date'],
            'updated_at': _wp_post['post_modified'],
            'image_urls': [],
            'related_article_ids': _wp_post['related_article_ids'],
            'related_article_titles': _wp_post['related_article_titles'],
            #'image_urls': [img['src'] for img in post_content_soup.findAll('img') if 'src' in img],
        }

        for img in post_content_soup.findAll('img'):
            try:
                isrc = img['src']
                _post['image_urls'].append(isrc)
            except:
                pass

        if 'wplink' in _wp_post:
            _post['wplink'] = _wp_post['wplink']

        if _wp_post['post_status'] == 'publish' or _wp_post[
                'post_status'] == 'future':
            _post['published_at'] = _wp_post['post_date']

        # change to features when import
        if 'series' in _wp_post['meta'] and _wp_post['meta']['series'] != "":
            _post['series_id'] = _wp_post['meta']['series']
        else:
            # query =>  select * from wp_term_relationships where term_taxonomy_id = 774;
            se = xs(cursor, feature_ids)
            if se is not None:
                _post['series_id'] = se['term_taxonomy_id']
            else:
                _post['series_id'] = 0

        ctrls = []
        try:
            ctrls = phpserialize.loads(_wp_post['meta']['pr']).values()
        except:
            pass

        _post['is_pr'] = '588' in ctrls
        _post['is_hide'] = '587' in ctrls

        if _post['is_hide']:
            _post['nolist_flg'] = True

        try:
            if _wp_post['meta']['_aioseop_noindex'] == 'on':
                _post['noindex_flg'] = True
        except:
            pass

        try:
            if _wp_post['meta']['_aioseop_nofollow'] == 'on':
                _post['nofollow_flg'] = True
        except:
            pass

        ## START add categories relations into post
        sql = "SELECT * FROM wp_term_relationships where object_id = {}".format(
            _wp_post['ID'])
        cursor.execute(sql)
        wp_term_relationships_result = cursor.fetchall()
        for wtr in wp_term_relationships_result:
            if wtr['term_taxonomy_id'] in cat_dict:
                _post['category_id'] = cat_dict[
                    wtr['term_taxonomy_id']]['term_id']
                break
        ## END

        ## START add tags relations into post
        _post['tag_ids'] = []
        is_fisco = False
        for wtr in wp_term_relationships_result:
            if wtr['term_taxonomy_id'] in post_tag_dict:
                # check if article is Fisco
                if post_tag_dict[wtr['term_taxonomy_id']]['term_id'] == 541:
                    is_fisco = True

                _post['tag_ids'].append(
                    post_tag_dict[wtr['term_taxonomy_id']]['term_id'])
                _pid = post_tag_dict[wtr['term_taxonomy_id']]['parent']
                while _pid != 0:
                    if _pid not in post_tag_dict:
                        break

                    _post['tag_ids'].append(post_tag_dict[_pid]['term_id'])
                    _pid = post_tag_dict[_pid]['parent']

        # Don't import Fisco articles
        if not is_fisco:
            wp_post_lists.append(_post)

    return wp_post_lists
Example #23
def cleanup_soup(soup):
    # Remove existing comments
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Remove the wikimedia TOC (there are 3 tags to remove)
    for tag in soup.find_all(name='span', id='Contents'):
        tag = tag.parent
        tag = tag.next_sibling
        tag = tag.next_sibling
        tag.previous_sibling.extract()
        tag.previous_sibling.extract()
        tag.extract()

    # Remove more wikimedia cruft (sidebar, footer)
    for tag in soup.find_all():
        if tag.has_attr('id'):
            if tag['id'] == 'jump-to-nav':
                tag.extract()
            if tag['id'] == 'siteSub':
                tag.extract()
            if tag['id'] == 'contentSub':
                tag.extract()
            if tag['id'] == "column-one":
                tag.extract()
            if tag['id'] == "footer":
                tag.extract()
            if tag['id'] == "toc":
                tag.extract()

    # ul tag may be bad and need an li.
    # html is fine without, but the latexified version would
    # otherwise baulk at a missing \item.
    for tag in soup.find_all(name='ul'):
        if not tag.contents[0].name == 'li':
            print("Bad ul tag fixed")
            tag.insert(0, Comment("\\item "))

    # Our two column mode
    # Each file is a chapter, starting at h1.
    # And with the 2-col environment inside it.
    # So go do that for this html
    tag = soup.body
    if tag:
        # The title is one column
        tag2 = soup.find('h1')
        # The empty argument to label_of_ref will give a label for this
        # source file, at its start.
        if tag2:
            tag2.insert_after(
                Comment('latex \\label{' + label_of_ref('') + '}'))
            tag2.insert_after(Comment('latex \\begin{multicols}{2}'))
        else:
            tag.insert(0, Comment('latex \\begin{multicols}{2}'))
            tag.insert(0, Comment('latex \\label{' + label_of_ref('') + '}'))
        tag.insert(-1, Comment('latex \\end{multicols}'))

    # Most text is two column.  Fix up the sections marked as full width.
    for tag in soup.find_all(name='div', class_="full-width"):
        tag.insert(0, Comment('\\end{multicols}\n'))
        tag.insert(-1, Comment('\\begin{multicols}{2}\n'))

    # anchors become \hyperrefs and \labels
    # provided they are relative.
    for tag in soup.find_all(name='a'):
        if tag.has_attr('href'):
            if not tag.find(name='img'):
                if not tag['href'].startswith('http'):
                    label = label_of_ref(tag['href'])
                    #print( "hyperref: ", label )
                    tag.insert_before(
                        Comment('latex \n\\hyperref[\\foo{' + label + '}]{'))
                    tag.insert_after(Comment('latex }\n'))

    # divs may provide \labels
    for tag in soup.find_all(name='div'):
        if tag.has_attr('id') and not tag.contents:
            label = label_of_ref(tag['id'])
            #print( "label: ", label )
            #insert label after the heading, if there was one
            #(this gets more accurate LaTeX hyperlink landings)
            #otherwise just insert it anyway.
            tag2 = tag.find_next_sibling(re.compile(r'^h\d'))
            if tag2:
                tag2.insert_after(Comment('latex \n\\label{' + label + '}'))
            else:
                print('No title for ' + label)
                tag.insert_before(Comment('latex \n\\label{' + label + '}'))

    # (valid) images get treated depending on their size
    # all our images are screenshots, so we just check sizes in pixels.
    #  - small images are inline, and are already sized (using dpi) for inline use
    #  - large images are 72 dpi, and will be at most one column width.
    for tag in soup.find_all(name='img'):
        if tag.has_attr('src'):
            png_filename = abs_filename_of_relative_link(tag['src'])
            if os.path.isfile(png_filename):
                with Image.open(png_filename) as image:
                    siz = image.size
                    if tag.has_attr('usemap'):
                        # no \par needed or used for image map.
                        tag = add_image_map(tag, siz)
                    elif siz[0] > 60 or siz[1] > 30:
                        #Bigger images...
                        #print( png_filename )
                        tag.insert_before(
                            Comment(
                                '\n\\par\\includegraphics[max width=\\linewidth]{'
                            ))
                        tag.insert_after(Comment('}\\par\n'))
                    else:
                        #small inline image
                        #the \texorpdfstring is because these inline images
                        #may occur in section headings, and without \texorpdfstring
                        #that would mess up the conversion to pdf which does not like
                        #images in the labels.
                        tag.insert_before(
                            Comment(
                                '\\texorpdfstring{\\protect\\includegraphics[max width=\\linewidth]{'
                            ))
                        tag.insert_after(Comment('}}{}'))
            # file name is used by includegraphics, so put it in.
            tag.insert(0, Comment(png_filename.replace('\\', '/')))
Example #24
# find a tag; CSS selectors can be used, on their own or in combination

bs.find_all(name, attrs, recursive, string, limit, **kwargs)
# search for and return a list of the matching elements; you can search
# with regular expressions or get the whole list of elements at once, and
# passing True matches every tag.

bs.select('p #author')
# returns a list of Tag objects representing HTML elements in
# BeautifulSoup. Searching with the flexibility of CSS selectors.

bs.select_one(css_selector)
# find the first tag matching the selector

bs.new_tag("a", href="http://www.example.com")
# creates a new tag

bs.stripped_strings
# iterate over the document's strings with surrounding whitespace stripped

bs.original_encoding
# auto-detected encoding



NavigableString('example text')
# constructor for text content, which can then be inserted into a tag

Comment('this comment')
# constructor for a comment, which can then be inserted into the
# document
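A small self-contained sketch exercising several of the calls above; the HTML and names are made up for illustration:

from bs4 import BeautifulSoup, NavigableString, Comment

html = '<html><body><p id="author">Jane</p><p>Hello <b>world</b></p></body></html>'
bs = BeautifulSoup(html, 'html.parser')

print(bs.find_all('p'))              # list of every <p> Tag
print(bs.select('p#author'))         # list of Tags matching the CSS selector
print(bs.select_one('p#author'))     # first Tag matching the selector

link = bs.new_tag('a', href='http://www.example.com')   # create a new tag
link.append(NavigableString('example text'))            # text node inserted into the tag
bs.body.append(link)
bs.body.append(Comment('this comment'))                 # rendered as <!--this comment-->

print(list(bs.stripped_strings))     # document strings with surrounding whitespace stripped
print(bs.original_encoding)          # None here, because the input was already a str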
Example #25
def htmlPreview(data):

    if "OPENSHIFT_APP_UUID" in os.environ:
        html_file = './app-root/repo/data/Email Security/template/Cisco_Email_Security_NLG_Template_v1.html'
    else:
        html_file = './data/Email Security/template/Cisco_Email_Security_NLG_Template_v1.html'

    file = codecs.open(html_file, 'r', 'utf-8')
    soup = BeautifulSoup(file, "html.parser")

    # Adding Main heading and sub heading

    main_heading = soup.find(id="main_heading")
    cisco_img = main_heading.td
    text_heading = main_heading.td.find_next('td')
    text_heading.string = data['intromainheading']
    text_heading.append(soup.new_tag('br'))
    text_heading.append(soup.new_string(data['introsubheading']))

    #Adding Customer name in email
    customer_name = soup.find(id='customername')

    customer_name.td.string.replace_with('\r\n\t\t\t\tHi {},\r\n'.format(
        data['first_name']))

    # Finding introtext in html
    introtext_1 = soup.find(id='introtext1')

    # Replacing introtext with data
    introtext1 = {k: v for k, v in data.items() if k.startswith("introtext1")}

    i = 0
    for key, value in sorted(introtext1.items()):
        if (i == 0):
            introtext_1.string = value
            i = i + 1
        else:
            introtext_1.append(soup.new_tag('br'))
            introtext_1.append(soup.new_string(value))

    # Finding recommendation link in html
    r_link = soup.find(id='recommendation')

    r_link['href'] = data['recommendationlink']

    # Finding introtext in html
    introtext_2 = soup.find(id='introtext2')

    # Replacing Second introtext with data
    introtext2 = {k: v for k, v in data.items() if k.startswith("introtext2")}

    i = 0
    for key, value in sorted(introtext2.items()):
        if (i == 0):
            introtext_2.string = value
            i = i + 1
        else:
            introtext_2.append(soup.new_tag('br'))
            introtext_2.append(soup.new_string(value))

    # Replacing status icon for features

    sicon = soup.find(id="statusicon")
    # print(sicon)

    sicon.img['src'] = data['statusicon']

    #Replacing 2nd main heading

    heading2main = soup.find(id="heading2main")

    heading2main.string = data['headingtwomain']

    heading2sub = soup.find(id="heading2sub")

    heading2sub.string = data['heading2sub']

    #Updating features details

    features = soup.find(id="features_status")

    features_text = {
        k: v
        for k, v in data.items()
        if k.startswith("feature") & k.endswith("text")
    }

    new_tag = {}
    j = 1

    for i in range(1, len(features_text) + 1):

        if features_text['feature{}text'.format(i)] != '':

            new_tag["feature{}".format(j)] = copy.copy(features.tr)

            new_tag["feature{}".format(j)].td.string = features_text[
                'feature{}text'.format(i)]
            new_tag["feature{}".format(j)].img['src'] = data[
                'feature{}statusimg'.format(i)]
            j += 1

    features.tbody.clear()
    # features.decompose()

    print("length of new_tag", len(new_tag))
    if (len(new_tag) == 0):
        features.decompose()
    else:
        for k in range(1, len(new_tag) + 1):
            features.tbody.append(Comment("LICENSE STATUS " + str(k)))
            features.tbody.append(new_tag["feature{}".format(k)])
            features.tbody.append(Comment("LICENSE STATUS END " + str(k)))
        # for key,value in new_tag.items():
        #     print("came here")
        #     features.tbody.append( Comment("LICENSE STATUS FOR " + str(key)))
        #     features.tbody.append(value)
        #     features.tbody.append( Comment("LICENSE STATUS END FOR" + str(key)))

    #Activation link
    activation_link = soup.find(id="activationlink")

    if (data['clicktoactivatelink'] == ''):
        activation_link.decompose()
    else:
        activation_link['href'] = data['clicktoactivatelink']

    # Contact details

    heading3 = soup.find(id="heading3main")

    heading3.strong.string = data['heading3main']

    text3 = soup.find(id="text3")

    text3.string = data['textbox3']

    outtext = soup.find(id="outrotext")

    outtext.string = data['outrotext']

    html_content = soup.prettify()

    return html_content
    def buildTFIDF(self):
        # Create new tf-idf tables
        cur2 = self.conn2.cursor()
        print("DB Initiation - Creating tf-idf tables")
        cur2.execute('''DROP TABLE IF EXISTS blogs_tf_idf''')
        cur2.execute('''DROP TABLE IF EXISTS blogs_title_tf_idf''')
        cur2.execute('''DROP TABLE IF EXISTS comments_tf_idf''')
        self.conn2.commit()
        cur2.execute('''CREATE TABLE blogs_tf_idf
                    (blog_id    INTEGER, 
                    word_id     INTEGER,
                    count       INTEGER,
                    tf_idf      FLOAT,
                    PRIMARY KEY(blog_id,word_id),
                    FOREIGN KEY(word_id) REFERENCES word_dict(id))''')
        self.conn2.commit()
        cur2.execute('''CREATE TABLE blogs_title_tf_idf
                    (blog_id    INTEGER, 
                    word_id     INTEGER,
                    count       INTEGER,
                    tf_idf      FLOAT,
                    PRIMARY KEY(blog_id,word_id),
                    FOREIGN KEY(word_id) REFERENCES word_dict(id))''')
        self.conn2.commit()
        cur2.execute('''CREATE TABLE comments_tf_idf
                    (blog_id    INTEGER, 
                    comment_id  INTEGER,
                    word_id     INTEGER,
                    count       INTEGER,
                    tf_idf      FLOAT,
                    PRIMARY KEY(blog_id,comment_id,word_id),
                    FOREIGN KEY(word_id) REFERENCES word_dict(id))''')
        self.conn2.commit()
        

        print("DB TFIDF Initialization - Loop Entries")
        cur = self.conn.cursor()
        # Select the title and blog ids from all the blogs
        allEntries = cur.execute("SELECT blog_id,title,body FROM blogs" + ("" if self.rowLimit is None else (" LIMIT " + str(self.rowLimit))))
        blogsTFIDF = dict()
        blogsTitleTFIDF = dict()
        commentsTFIDF = dict()
        idx = 0

        # Loop all the blogs for tf-idf preparation
        blogCount = Blog.getCount(self.conn) if self.rowLimit is None else self.rowLimit
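        # Note on the dict comprehensions below: each key w produced by
        # transformTextToIDs appears to be a word_dict row (word, id, count,
        # freq, idf), so w[1] is the word id and w[4] is the precomputed idf.
        # The stored tf-idf is therefore (count / eleLen) * idf, i.e. term
        # frequency within the text times inverse document frequency.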
        for i in allEntries:
            # Segment the title and push into the counter
            allWordsTitle = self.transformTextToIDs(i[1])
            titleCounter = collections.Counter(allWordsTitle)
            eleLen = sum(titleCounter.values())
            # There may be cases with no valid words found
            if (eleLen > 0):
                blogsTitleTFIDF[i[0]] = {w[1]: (ctn, ctn/eleLen*w[4]) for w,ctn in titleCounter.items()}
           
            # Segment the body and push into the counter
            allWordsBody = self.transformTextToIDs(i[2])
            bodyCounter = collections.Counter(allWordsBody)
            eleLen = sum(bodyCounter.values())
            # There may be cases with no valid words found
            if (eleLen > 0):
                blogsTFIDF[i[0]] = {w[1]: (ctn, ctn/eleLen*w[4]) for w,ctn in bodyCounter.items()}

            # Get the comments and push all the words
            comments = Comment.getFromDB(i[0])
            commentsTFIDF[i[0]] = dict()
            for c in comments:
                allWordsComment = self.transformTextToIDs(c.body)
                commentCounter = collections.Counter(allWordsComment)
                eleLen = sum(commentCounter.values())
                # There may be cases with no valid words found
                if (eleLen > 0):
                    commentsTFIDF[i[0]][c.comment_id] = {w[1]: (ctn, ctn/eleLen*w[4]) for w,ctn in commentCounter.items()}

            # Log progresses
            idx+=1
            if (idx % 500 == 0):
                print("Processing... (", idx/blogCount*100, " %)")
        
        # Loop all the data and insert into the db
        titleTFIDFLen = len(blogsTitleTFIDF)
        idx = 0
        for blog_id,titleWords in blogsTitleTFIDF.items():
            for word_id,titleTfidf in titleWords.items():
                cur2.execute("INSERT INTO blogs_title_tf_idf VALUES(" + str(blog_id) + ", " + str(word_id) + ", " + str(titleTfidf[0]) + ", " + str(titleTfidf[1]) + ")")
            
            # Log progresses
            idx += 1
            if (idx % 500 == 0):
                print("Processing - Blog Titles ... (", idx/titleTFIDFLen*100, " %)")
        
        # Loop all the data and insert into the db
        blogTFIDFLen = len(blogsTFIDF)
        idx = 0
        for blog_id,blogWords in blogsTFIDF.items():
            for word_id,blogTfidf in blogWords.items():
                cur2.execute("INSERT INTO blogs_tf_idf VALUES(" + str(blog_id) + ", " + str(word_id) + ", " + str(blogTfidf[0]) + ", " + str(blogTfidf[1]) + ")")
            
            # Log progresses
            idx += 1
            if (idx % 500 == 0):
                print("Processing - Blogs ... (", idx/blogTFIDFLen*100, " %)")
        
        # Loop all the comments and insert into the db
        commentTFIDFLen = len(commentsTFIDF)
        idx = 0
        for blog_id,comments in commentsTFIDF.items():
            for comment_id,commentWords in comments.items():
                for word_id,commentTfidf in commentWords.items():
                    cur2.execute("INSERT INTO comments_tf_idf VALUES(" + str(blog_id) + ", " + str(comment_id) + ", " + str(word_id) + ", " + str(commentTfidf[0]) + ", " + str(commentTfidf[1]) + ")")
            # Log progresses
            idx += 1
            if (idx % 500 == 0):
                print("Processing - Comments ... (", idx/commentTFIDFLen*100, " %)")
        
        self.conn2.commit()
Example #27
from bs4 import BeautifulSoup, Comment
bsinstance = BeautifulSoup(open('ltps_parse.html'), "lxml")

#Adds a string and a comment to the tag (can be done separately):
commenttoadd = Comment("Here's the comment my friend")
links = bsinstance.find('link')
links.append('test1')
links.append(commenttoadd)
print(links)
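# Note: append() adds the new node at the end of the tag's children, while
# insert(i, node) lets you choose the position; both accept plain strings as
# well as Comment objects.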
"""#Insert a string to the tag (works like append except we can choose the position):
links = bsinstance.find('link')
links.append('test1')# IF i wrote : links.clear() : The contents would be deleted, but not the attrs
links.insert(0,'test2') 
print links """
    def build(conn = sqlite3.connect(DB_FILE), conn2 = sqlite3.connect(DB_FILE2), rowLimit = None, segType = 2):
        ''' Build the dictionary of all the Chinese words and English words.

            Parameters
            ====================================

            conn    `sqlite3.Connection`    - A SQLite connection object for the data source. Defaults to a new connection to the global DB_FILE database file.
            conn2   `sqlite3.Connection`    - A SQLite connection object for the word dictionary. Defaults to a new connection to the global DB_FILE2 database file.
            rowLimit    `int`               - The maximum number of blog rows to process.
            segType     `int`               - 0: by characters; 1: by characters, but remove English words; 2: by jieba

            Returns
            ====================================

            `WordDict` - A WordDict object wrapping the connections for the newly built dictionary.
        '''
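        # A minimal usage sketch (hypothetical database file names; assumes
        # build is exposed as a static helper on the WordDict class):
        #
        #     wd = WordDict.build(conn=sqlite3.connect("blogs.db"),
        #                         conn2=sqlite3.connect("word_dict.db"),
        #                         rowLimit=1000, segType=2)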

        cur = conn.cursor()

        # Count the number of blogs and collect all the blog ids
        if (rowLimit is None):
            cur.execute("SELECT COUNT(blog_id) FROM blogs")
            blogCount = cur.fetchone()[0]
        else:
            blogCount = rowLimit

        # Create dictionary table in the new db
        cur2 = conn2.cursor()
        print("DB Initiation - Creating dictionary table")
        cur2.execute('''DROP TABLE IF EXISTS word_dict''')
        cur2.execute('''DROP TABLE IF EXISTS blogs_tf_idf''')
        cur2.execute('''DROP TABLE IF EXISTS blogs_title_tf_idf''')
        cur2.execute('''DROP TABLE IF EXISTS comments_tf_idf''')
        conn2.commit()
        cur2.execute('''CREATE TABLE word_dict
                    (word    TEXT, 
                    id       INTEGER,
                    count    INTEGER,
                    freq     FLOAT,
                    idf      FLOAT,
                    PRIMARY KEY(id))''')
        conn2.commit()

        wordDict = WordDict(conn, conn2, segType=segType, rowLimit=rowLimit)


        print("DB Initiation - Loop Entries")
        # Select the blog id, title and body from all the blogs
        allEntries = cur.execute("SELECT blog_id,title,body FROM blogs" + ("" if rowLimit is None else (" LIMIT " + str(rowLimit))))
        wordCount = dict()
        idx = 0
        wordDict.initalCorpusCount()
        corpusCount = wordDict.corpusCount

        # Loop all the blogs for dictionary preparation
        for i in allEntries:
            # Segment the title and push into the counter
            allWordsTitle = WordDict.segment(i[1], segType = segType)
            wordsTitle = set(allWordsTitle)
            for w in wordsTitle:
                wordCount[w] = wordCount.setdefault(w, 0) + 1
            
            # Segment the body and push into the counter
            allWordsBody = WordDict.segment(i[2], segType = segType)
            wordsBody = set(allWordsBody)
            for w in wordsBody:
                wordCount[w] = wordCount.setdefault(w, 0) + 1

            # Get the comments and push all the words
            comments = Comment.getFromDB(i[0])
            for c in comments:
                allWordsComment = WordDict.segment(c.body, segType = segType)
                wordsComment = set(allWordsComment)
                for w in wordsComment:
                    wordCount[w] = wordCount.setdefault(w, 0) + 1

            # Log progresses
            idx+=1
            if (idx % 500 == 0):
                print("Processing... (", idx/blogCount*100, " %)")

        # Loop all the words and insert into the db
        wordCountLen = len(wordCount)
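        # wordCount[w] is the number of texts (titles, bodies and comments)
        # that contain w, so freq = docFreq / corpusCount and
        # idf = log(corpusCount / docFreq), assuming corpusCount holds the
        # total number of texts counted by initalCorpusCount().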
        for idx,w in enumerate(wordCount):
            line = "INSERT INTO word_dict VALUES('" + w.replace("'","''") + "', " + str(idx) + ", " + str(wordCount[w]) + ", " + str(wordCount[w]/corpusCount) + ", " + str(math.log(corpusCount/wordCount[w])) + ")"
            cur2.execute(line)
            if (idx % 500 == 0):
                print("Insertion... (", idx/wordCountLen*100, " %)")
        
        conn2.commit()

        return wordDict
Example #29
"""
doxygen2qtcreator.py:

This script scans for documented classes inside a Doxygen 'html' directory and inserts markers
used by Qt Creator to generate the tooltip shown when hovering over a class or method name.

It uses BeautifulSoup4 to parse and modify the HTML files.
"""
from __future__ import print_function
import os, sys
import argparse
from bs4 import BeautifulSoup, Comment

# Qt Creator markers

class_brief_start = lambda classname: Comment(" $$$ %s-brief " % classname)
class_brief_end = lambda classname: Comment(" @@@%s " % classname)
method_start = lambda methodname, signature: Comment(" $$$%s[overload1]$$$%s "
                                                     % (methodname, signature))
method_end = lambda methodname: Comment(" @@@%s " % methodname)
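# For illustration (assumed names): class_brief_start("MyClass") produces the
# HTML comment "<!-- $$$ MyClass-brief -->", and method_end("myMethod")
# produces "<!-- @@@myMethod -->"; Qt Creator looks for these markers when it
# extracts tooltip text from the generated documentation.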


def insert_class_markers(soup):
    """ Inserts Qt Creator markers around the class brief paragraph."""

    # look for class name in a div like <div class="title">Namespace::MyClass Class Reference</div>
    title_div = soup.find("div", "title")
    if not title_div:
        raise ValueError(
            "The following div was not found : <div class='title'>...<div>")
Example #30
'''

text = 'You are viewing information archived from Mozilla.org on %s.' % (
    datetime.utcnow().strftime("%Y-%m-%d"))

# process every file
for filename in locate("*.html", args.directory):

    with open(filename, "r") as f:
        soup = BeautifulSoup(f, "lxml")

    if len(soup.select('#archived')) == 0:
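        # No #archived block yet, so this file has not been processed before.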
        print('Processing %s' % filename)

        # get rid of search form
        for s in soup.select('#quick-search'):
            s.replace_with(Comment('search removed'))

        # add styles for notification block
        style = soup.new_tag('style', type='text/css')
        style.append(css)
        soup.head.append(style)

        # add notification block
        div = soup.new_tag('div', id='archived')
        div.append(text)
        soup.body.insert(0, div)

        with open(filename, "w") as f:
            f.write(str(soup))