Example #1
def anchorArticles(txt):
    # find all text nodes starting with "Article", wrap the matched heading in a named <a>,
    # and prepend a hoverable link pointing at that anchor
    aregex = re.compile(r'^\s*Article\s+[0-9][0-9.,]*', re.I)
    nsoup = BeautifulSoup(txt)
    node=nsoup.find(text=aregex)
    while node:
        nodeidx=node.parent.contents.index(node)
        match=str(re.match(aregex,node).group())
        # create named <a>
        name=match.replace(' ','_')
        a=Tag(nsoup,'a',[('name',name)])
        a.insert(0,match)
        # create a link that is displayed if the <a> is hovered
        link=Tag(nsoup,'a', [('class',"anchorLink"), ('href','#'+name)])
        link.insert(0,"#")
        # create a container for the a and the link
        hover=Tag(nsoup,'span',[('class','hover')])
        hover.insert(0,a)
        hover.insert(0,link)
        node.parent.insert(nodeidx,hover)
        # cut the newly wrapped from the original node.
        newNode=NavigableString(node[len(match):])
        node.replaceWith(newNode)
        node=newNode.findNext(text=aregex)
    return str(nsoup)
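A minimal usage sketch for anchorArticles, assuming the legacy BeautifulSoup 3 imports (BeautifulSoup, Tag, NavigableString) and re are already in scope; the sample HTML below is made up for illustration:

# Hypothetical input: two "Article ..." headings inside one document.
html = "<div><p>Article 12 Scope</p><p>Article 13.1 Definitions</p></div>"
# Each matching text node ends up preceded by a <span class="hover"> holding a
# "#" link and a named <a>, so the headings can be deep-linked and hovered.
print(anchorArticles(html))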
Example #2
def load_scripts(pkg):
    """
    Given a package expand ul#scripts to include the contents of any scripts
    """

    script_ul = SOUP.find("ul", {"id": "scripts"})
    script_ul.contents = []

    for f in os.listdir(pkg):
        if splitext(f)[1] != '.pkg':
            continue

        script_dir = join(pkg, f, 'Scripts')
        script_list = Tag(SOUP, 'ul')

        for script in os.listdir(script_dir):
            if script == "Tools":
                continue

            script_li = Tag(SOUP, 'li')
            script_li['class'] = 'code'
            script_path = join(script_dir, script)

            if isfile(script_path):
                script_li.append(join(f, 'Scripts', script))
                script_li.append(anchor_for_name(script_path))
                script_pre = Tag(SOUP, 'pre')
                script_pre.append(NavigableString(open(script_path).read()))
                script_li.append(script_pre)
            elif isdir(script_path):
                subscript_files = os.listdir(script_path)
                if not subscript_files:
                    continue

                script_li.append("%s Scripts" % join(f, 'Scripts', script))
                subscripts = Tag(SOUP, 'ul')

                for subscript in subscript_files:
                    subscript_path = join(script_path, subscript)
                    subscript_li = Tag(SOUP, 'li')
                    subscript_li.append(subscript)
                    subscript_li.append(anchor_for_name(subscript_path))

                    subscript_pre = Tag(SOUP, 'pre')
                    subscript_pre.append(
                        NavigableString(open(subscript_path).read()))
                    subscript_li.append(subscript_pre)

                    subscripts.append(subscript_li)

                script_li.append(subscripts)

            script_list.append(script_li)

        if script_list.contents:
            new_scripts = Tag(SOUP, 'li')
            new_scripts.append(NavigableString("%s Scripts" % f))
            new_scripts.append(script_list)
            script_ul.append(new_scripts)
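load_scripts relies on a module-level SOUP and an anchor_for_name helper that are not shown in this snippet. A minimal sketch of a plausible stand-in, assuming it only has to produce a named <a> tag for deep-linking and that basename is imported from os.path alongside the other path helpers (this is a guess, not the original implementation):

def anchor_for_name(path):
    # Hypothetical stand-in: build an <a name="..."> tag from a script path so
    # each listed script can be linked to directly (BeautifulSoup 3 Tag API).
    anchor = Tag(SOUP, 'a')
    anchor['name'] = basename(path)
    return anchor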
Example #3
def generateContentDivTag(baseDir, h3text):
    import __main__

    contentDivTag = Tag(formatSoup, 'div', attrs={'class' : 'content band-content'})
    # embed the heading
    h3tag = Tag(formatSoup, 'h3')
    h3tag.append(NavigableString(h3text))
    contentDivTag.append(h3tag)


    # generate the HTML
    for file in os.listdir(PARENT_DIR + baseDir):
        if file.endswith(SHTML_EXT):
            # generate the band-name ul tag
            progreUlTag = generateUlTag('/' + baseDir, file, 'column')
            albumLiTag = Tag(formatSoup, 'li')
            progreUlTag.append(albumLiTag)

            # generate the album-name ul tags
            fileSoup = BeautifulSoup(open('/'.join([PARENT_DIR, '/' + baseDir, file])))
            albumList = []
            for albumClassTag in fileSoup.findAll('a', {'class' : 'album-name'}):
                albumList.append(albumClassTag['href'].split('/')[-1])
                __main__.contentCount += 1

            albumDir = '/'.join([baseDir, file.split('.')[0]])
            for album in albumList:
                albumUlTag = generateUlTag('/' + albumDir, album, 'child-column')
                albumLiTag.append(albumUlTag)
            contentDivTag.append(progreUlTag)

    return contentDivTag
Example #4
    def _set(self, topic, key, value, topic_attr=None):
        """Set key and value at topic
        
        :return: success status
        :rtype: bool"""

        # In case it is an empty document
        if not unicode(self._soup).strip().startswith("<?xml"):
            self._soup.insert(0, NavigableString(self.HEADER))

        # In case settings root is not defined
        settings = self._soup.find(self.root)
        if settings is None:
            self._soup.insert(1, Tag(self._soup, self.root))
            settings = self._soup.find(self.root)

        # Add Topic
        topic_tag = self._set_element(settings, topic.lower(), attr=topic_attr)

        if topic_tag is None:
            return False

        # Add key and value
        key_tag = self._set_element(topic_tag, key.lower(), escape(value))
        # Add "" since XML may introduce whitespaces.
        #key_tag = self._set_element(topic_tag, key, '"{0}"'.format(value))

        return key_tag is not None
Example #5
    def _set_element(self, root, tagname, text=None, attr=None):
        """Creates if not available an element at the soup root element
        
        :return: tag object or None
        :rtype: Tag
        """

        # Add Topic if not available
        if attr is None:
            if root.find(re.compile(tagname + "$", re.I)) is None:
                new_tag = Tag(self._soup, tagname)
                root.insert(0, new_tag)
        else:
            if root.find(re.compile(tagname + "$", re.I), attr) is None:
                new_tag = Tag(self._soup, tagname, attr.items())
                root.insert(0, new_tag)

        settings = self._soup.find(self.root)
        tag = settings.find(re.compile(tagname + "$", re.I))

        # Something to insert
        if tag is not None and text is not None:
            if tag.text.strip() == "":
                tag.insert(0, NavigableString(text))
            else:
                tag.contents[0].replaceWith(text)

        return tag
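Taken together, _set and _set_element build a small XML settings document. A rough sketch of the target structure, built directly with the same BeautifulSoup 3 primitives (the topic "display" and key "width" are invented; this only illustrates the shape the two methods produce):

from BeautifulSoup import BeautifulSoup, Tag, NavigableString

soup = BeautifulSoup()
soup.insert(0, NavigableString('<?xml version="1.0" encoding="UTF-8"?>\n'))
settings = Tag(soup, 'settings')      # the self.root element
soup.insert(1, settings)
topic = Tag(soup, 'display')          # topic tag, lower-cased by _set
settings.insert(0, topic)
key = Tag(soup, 'width')              # key tag holding the escaped value
topic.insert(0, key)
key.insert(0, NavigableString('1280'))
print(soup)   # <?xml ...?><settings><display><width>1280</width></display></settings>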
Example #6
def select_calendar(month=None, year=None):
    now = datetime.now()
    day = now.day
    cal = calendar.HTMLCalendar()
    cal.setfirstweekday(6)
    month_table = cal.formatmonth(year, month)
    soup = BeautifulSoup(month_table)
    outfile = open("myHTML.html", 'w')

    for data in soup.findAll('td'):
        if data['class'] != "noday":
            days = data.findAll(text=True)
            for oneday in days:
                day = NavigableString(oneday)
                oneday.extract()
                addatag = Tag(soup, 'input')
                addatag['type'] = "submit"
                addatag['name'] = "meetingday"
                addatag['value'] = day
                data.insert(0, addatag)

    outfile.write(soup.prettify())
    outfile.close()
    infile = open("myHTML.html", 'r')
    calfile = ""
    for line in infile:
        calfile = calfile + line
    infile.close()

    return calfile
Example #7
def replaceJavascript(base_url, soup):
    for js in soup.findAll('script', {'src': re.compile('.+')}):
        try:
            real_js = get_content(resolve_path(base_url, js['src']))
            # escape "</" so the inlined JS cannot prematurely close the <script> tag
            real_js = real_js.replace('</', r'\u003c/')
            js_tag = Tag(soup, 'script')
            js_tag.insert(0, NavigableString(real_js))
            js.replaceWith(js_tag)
        except Exception,e:
            print 'failed to load javascript from %s' % js['src']
            print e
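replaceJavascript (and replaceCss in Example #19) depend on resolve_path and get_content helpers that are not included in these snippets. A minimal Python 2 sketch of plausible stand-ins using urlparse and urllib2; the real helpers may differ (get_content, for instance, appears to take an extra binary flag in replaceCss):

import urllib2
import urlparse

def resolve_path(base_url, relative):
    # Hypothetical: make a relative src/href absolute against the page URL.
    return urlparse.urljoin(base_url, relative)

def get_content(url, binary=False):
    # Hypothetical: fetch a resource; return raw bytes for binary assets,
    # decoded text otherwise.
    data = urllib2.urlopen(url).read()
    return data if binary else data.decode('utf-8', 'replace')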
Example #8
 def appendChild(self, node):
     if (node.element.__class__ == NavigableString and self.element.contents
             and self.element.contents[-1].__class__ == NavigableString):
         newNode = TextNode(
             NavigableString(self.element.contents[-1] + node.element),
             self.soup)
         self.element.contents[-1].extract()
         self.appendChild(newNode)
     else:
         self.element.insert(len(self.element.contents), node.element)
         node.parent = self
Example #9
def createParentUlTag(targetSoup):
    parentUlTag = Tag(targetSoup,
                      'ul',
                      attrs={
                          'class': 'xbreadcrumbs',
                          'id': 'breadcrumbs'
                      })
    topListTag = Tag(targetSoup, 'li')
    topAnchorTag = Tag(targetSoup, 'a', attrs={'href': SITE_DOMAIN})
    topAnchorTag.append(NavigableString('TOP'))
    topListTag.append(topAnchorTag)
    parentUlTag.append(topListTag)
    return parentUlTag
Example #10
def generateUlTag(path, file, ulClass):
    # generate the band-name tag
    fileSoup = BeautifulSoup(open('/'.join([PARENT_DIR, path, file])))
    text = fileSoup.find('h1').renderContents()
    ulTag = Tag(formatSoup, 'ul', attrs={'class' : ulClass})
    liTag = Tag(formatSoup, 'li')
    link = '/'.join([path, file])
    aTag = Tag(formatSoup, 'a', attrs={'href' : link})
    aTag.append(NavigableString(text))
    liTag.append(aTag)
    ulTag.append(liTag)

    return ulTag
Example #11
 def insertBefore(self, node, refNode):
     index = self.element.contents.index(refNode.element)
     if (node.element.__class__ == NavigableString and self.element.contents
             and self.element.contents[index - 1].__class__
             == NavigableString):
         newNode = TextNode(
             NavigableString(self.element.contents[index - 1] +
                             node.element), self.soup)
         self.element.contents[index - 1].extract()
         self.insertBefore(newNode, refNode)
     else:
         self.element.insert(index, node.element)
         node.parent = self
Example #12
 def findEntries(self):
     self.entries = []
     headers = map(lambda x: x.string, self.soup.findAll('h1')[2:])
     table = self.soup.findAll('div', id="bodyContent")[0]
     for table in table.findAll('table'):
         header = True
         for tr in table.findAll('tr'):
             if header:
                 header = False
                 continue
             i = 0
             for th in tr.findAll('td'):
                 description = ''
                 if i == 0:
                     name = ''.join(th.b.findAll(text=True)).replace(
                         ' ', '')
                     anchor = string.capitalize(
                         urllib.quote(name.split('.')[0])) + "."
                     if anchor in headers:
                         url = self.baseURL + 'About:config_entries#' + anchor
                     else:
                         url = self.baseURL + 'About:config_entries'
                 elif i == 1:
                     value = th.text
                 elif i == 2:
                     if value:
                         article = 'a'
                         if value[0] == 'I': article += 'n'
                         optionType = "it accepts " + article + " " + value.lower(
                         ) + "."
                     synopsis = '"' + name + '"'  + ' is a configuration option ' \
                             'for the Firefox web browser; ' + optionType + "<br>"
                     for tag in th.findAll('br'):
                         tag.insert(0, NavigableString("\n"))
                     description = ''.join(th.findAll(text=True))
                     description = description.rstrip().replace(
                         '\n', '<br>').strip()
                     expandedURL = 'href="' + self.baseURL
                     description = description.replace(
                         'href="/', expandedURL)
                     description = re.sub('<\s*b\s*>', '<i>', description)
                     description = re.sub('<\s*/\s*b\s*>', '</i>',
                                          description)
                     description = '<blockquote>' + description + '</blockquote>'
                     description = synopsis + description
                     i = -1
                     self.entries.append(
                         Entry(name, value, description.strip(), url))
                 i += 1
Example #13
    def insertBefore(self, node, refNode):
        index = self._nodeIndex(node, refNode)
        if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[index-1].__class__ == NavigableString):
            # (See comments in appendChild)
            newStr = NavigableString(self.element.contents[index-1]+node.element)
            oldNode = self.element.contents[index-1]
            del self.element.contents[index-1]
            oldNode.parent = None
            oldNode.extract()

            self.element.insert(index-1, newStr)
        else:
            self.element.insert(index, node.element)
            node.parent = self
Example #14
 def getNotes(self, target_language=None):
     notes = self.getTranslatedReferences(fieldname='notesToEditors',
                                          target_language=target_language)
     texts = [note.getText() for note in notes]
     # insert number here...
     for i in range(len(texts)):
         soup = BeautifulSoup(texts[i])
         pTag = soup.p
         if not pTag:
             # can't do anything, continue
             continue
         substr = pTag.contents[0].string
         pTag.contents[0] = NavigableString(
             u'<span class="numbering">%d.</span> ' % (i + 1) + substr)
         texts[i] = str(soup)
     return texts
Example #15
def get_list_for_key(name, children):
    """
    Takes a key and a dictionary containing its children and recursively
    generates HTML list items. Each item will contain the name and, if it has
    children, an unordered list containing those child items.
    """

    li = Tag(SOUP, "li")
    li.append(NavigableString(name))

    if children:
        ul = Tag(SOUP, "ul")
        for k, v in children.items():
            ul.append(get_list_for_key(k, v))
        li.append(ul)

    return li
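A short usage sketch for get_list_for_key, assuming SOUP has been initialised at module level as in the other snippets and the BeautifulSoup 3 names are imported; the nested dictionary is illustrative only:

SOUP = BeautifulSoup()   # module-level soup used by get_list_for_key
tree = {'usr': {'local': {'bin': {}}, 'share': {}}}
for name, children in tree.items():
    # each top-level key becomes an <li>; its children become a nested <ul>
    print(get_list_for_key(name, children))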
Example #16
def backupOriginal():
    fh = open(sourceFile, 'r')
    html = fh.read()
    fh.close()

    soap = BeautifulSoup(html)
    div = soap.find("div", {"class": "tabs2"})
    # Adding menu of letters at the end of navigation bar
    text = NavigableString(createMenu("All"))
    div.append(text)
    #    div.insert(div.__len__(), createMenu("All"))

    html = soap.renderContents()

    output = open(PROJECT_LOCATION + "/doc/html/" + PREFIX + "All.html", "w")
    output.write(html)
    output.close()
Example #17
    def appendChild(self, node):
        if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # Concatenate new text onto old text node
            # (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...")
            newStr = NavigableString(self.element.contents[-1]+node.element)

            # Remove the old text node
            # (Can't simply use .extract() by itself, because it fails if
            # an equal text node exists within the parent node)
            oldElement = self.element.contents[-1]
            del self.element.contents[-1]
            oldElement.parent = None
            oldElement.extract()

            self.element.insert(len(self.element.contents), newStr)
        else:
            self.element.insert(len(self.element.contents), node.element)
            node.parent = self
Example #18
def main(pkg_file_name, html_file_name):
    global SOUP

    print "Generating %s from %s" % (html_file_name, pkg_file_name)

    pkg = expand_pkg(pkg_file_name)
    SOUP = BeautifulSoup(open("wtfUpdate.html").read())

    SOUP.find('title').contents = [
        NavigableString("wtfUpdate: %s" % basename(pkg_file_name))
    ]

    try:
        generate_package_report(pkg)
        html_file = open(html_file_name, 'w')
        html_file.write(str(SOUP))
        html_file.close()
    except RuntimeError, exc:
        print >> sys.stderr, "ERROR: %s" % exc
        sys.exit(1)
Example #19
def replaceCss(base_url,soup):
    for css in soup.findAll('link', {'rel':'stylesheet', 'href':re.compile('.+')}):
        try:
            real_css = get_content(resolve_path(base_url, css['href']))

            def replacer(result):
                try:
                    path = resolve_path(resolve_path(base_url,css['href']), result.groups()[0])
                    path = path.replace('"', '').replace("'", "")
                    print path
                    return u'url(%s)' % data_encode_image(path,get_content(path, True))
                except Exception, e:
                    print e
                    return u''

            style_tag = Tag(soup, 'style')
            style_tag.insert(0, NavigableString(re.sub(css_url, replacer, real_css)))
            css.replaceWith(style_tag)

        except Exception, e:
            print 'failed to load css from %s' % css['href']
            print e
Example #20
    def get_converted_html(self, soup):

        # remove notes for Chinese characters: Tag <rt>
        #for nstring in soup.findAll( {'rt' : True}):
        #    nstring.extract()

        # remove the <ruby> tag but keep the characters inside
        for pTag in soup.findAll({'p': True}):
            #only support calibre-converted ebook, for NOW
            if pTag['class'] != 'calibre':
                continue
            new_content = u''
            for content in pTag.contents:
                if type(content) is NavigableString:
                    new_content = new_content + content
                elif content.name == "ruby":
                    for ruby_char in content.contents:
                        #ignore <rt>
                        if type(ruby_char) is NavigableString:
                            new_content = new_content + ruby_char

            #print(new_content)
            #continue
            # new_content is a sentence. send it to Google translate
            try:
                en_text = self._conversion_engine.convert(new_content)
            except Exception, err:
                en_text = "TRANSLATE ERROR"
                print(err)
                print(new_content)

            if len(new_content) > 0:
                br1 = Tag(soup, "br")
                br2 = Tag(soup, "br")
                idx = len(pTag.contents)
                pTag.insert(idx, br1)
                pTag.insert(idx + 1, NavigableString("&emsp;" + en_text))
                pTag.insert(idx + 2, br2)
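Example #21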
    def geo_term_extract(self, desc):
        data = values = {
            'maxRows': '1',
            'fuzzy': '1',
            'country': 'EE',
            'featureClass': 'P',
            'operator': 'OR',
            'username': self.geonames_user,
            'q': desc.encode('utf-8')
        }
        data = urllib.urlencode(values)

        link = u"http://api.geonames.org/search"
        xmldata = urllib.urlopen(link, data)
        soup = BeautifulSoup(xmldata)
        #   print soup.prettify()
        lng = '0'
        lat = '0'
        if len(soup.findAll("lat")) > 0:
            lng = soup.findAll("lng")[0].text
            lat = soup.findAll("lat")[0].text
            lat_f = float(lat)
            lng_f = float(lng)
            lat = '%.5f' % ((lat_f * 10000 + random.uniform(1, 80)) / 10000)
            lng = '%.5f' % ((lng_f * 10000 + random.uniform(1, 80)) / 10000)

        soup2 = BeautifulSoup()
        tag1 = Tag(soup2, "Point")
        tag2 = Tag(soup2, "coordinates")
        soup2.insert(0, tag1)
        tag1.insert(0, tag2)
        text = NavigableString(lng + "," + lat)
        tag2.insert(0, text)
        #   print soup2
        result = (soup2.__str__()).encode("utf-8")
        return [result, lat, lng]
Example #22
def createHtmlPages():

    HTMLHeader = getHeader()
    HTMLFooter = getFooter()

    for list in LINKS:
        letter = list[0]

        html = HTMLHeader

        for item in list[1:]:
            html += item + "\n"

        html += HTMLFooter

        soap = BeautifulSoup(html)
        div = soap.find("div", {"class": "tabs2"})

        text = NavigableString(createMenu(letter))
        div.append(text)

        #        div.insert(div.__len__(), createMenu(letter))

        html = soap.renderContents()

        path = PROJECT_LOCATION + "/doc/html/" + PREFIX + letter + ".html"
        output = open(path, "w")
        output.write(html)
        output.close()

        if letter == "A":
            output = open(sourceFile, "w")
            output.write(html)
            output.close()

        print PROJECT_LOCATION + "/doc/html/" + PREFIX + letter + ".html    Done!"
Example #23
	res = subprocess.check_output(['curl', '-s', url])

	soup = BeautifulStoneSoup(res)
	stations = soup.findAll('station')
	def station_distance_key(station):
		dist = station.find('distance_mi')
		# Airports don't have a distance_mi, calc via lat/long
		if dist==None:
			slat = float(station.find('lat').string)
			slong = float(station.find('lon').string)
			dist = math.sqrt((slat-lat)**2 + (slong-lon)**2)
		else:
			dist = float(dist.string)
		return dist

	stations = sorted(stations, key=station_distance_key)
	station = stations[0]
	if station.parent.name == 'pws':
		api_url = "http://api.wunderground.com/weatherstation/WXCurrentObXML.asp?ID=%s" % urllib.quote(NavigableString.__str__(station.find('id').contents[0].string))
	else:
		api_url = "http://api.wunderground.com/weatherstation/WXCurrentObXML/index.xml?query=%s" % urllib.quote(NavigableString.__str__(station.find('icao').contents[0].string))

	res = subprocess.check_output(['curl', '-s', api_url])
	soup = BeautifulStoneSoup(res)
	res = "%s <fc=#AAAAFF>%iF</fc>" % (soup.find('city').string, int(float(soup.find('temp_f').string)))
	print res
	cache_file.write(res)
	cache_file.close()
except: 
	print "err"
Example #24
def update_testCase_result(src, soup):
    #print src
    localtime = time.localtime()
    updateTime = "%s_%s_%s_%s_%s" % (localtime[0], localtime[1], localtime[2],
                                     localtime[3], localtime[4])
    head = soup.h1
    #update head
    head.contents[0].replaceWith("BU test report %s" % updateTime)
    table_map = {
        "BU sanity test result. URL:": [
            "U6_BU_CI",
        ],
    }

    if not re.search("fp_version", src):
        tc_name = re.search("name=(.*?) ", src).group(1).strip("HZ-").strip()
        verdict = re.search("verdict=(.*?) ", src).group(1).strip()
        assc = re.search("assc=(.*?) ", src).group(1).strip()
        tw = re.search("tw=(.*?) ", src).group(1).strip()
        mgw = re.search("mgw=(.*?) ", src).group(1).strip()
        script = re.search("script=(.*?) ", src).group(1).strip()
        boa = re.search("boa=(.*?) ", src).group(1).strip()
        nelmon = re.search("nelmon=(.*?) ", src).group(1).strip()
        link = re.search("link=(.*)", src).group(1).strip()

        try:
            tc = soup.find(text=tc_name)  #node of text:test case name in soup
            #print tc
            tc.previous['href'] = link  #update link
            verdict_list = tc.parent.parent.findNextSiblings(
                'td', limit=7)  #verdict, tw, nelmon, assc, script, mgw, boa
            #print verdict_list
            #update verdict
            if "PASS" == verdict:
                tc.parent.parent['bgcolor'] = "green"
                verdict_list[0]['bgcolor'] = "green"
                verdict_list[0].contents[0].replaceWith("PASS")
            elif "FAIL" == verdict:
                tc.parent.parent['bgcolor'] = "red"
                verdict_list[0]['bgcolor'] = "red"
                verdict_list[0].contents[0].replaceWith("FAIL")
            elif "WARNING" == verdict:
                tc.parent.parent['bgcolor'] = 'yellow'
                verdict_list[0]['bgcolor'] = 'yellow'
                verdict_list[0].contents[0].replaceWith('WARNING')

            #update TW
            if "PASS" == tw:
                verdict_list[1]['bgcolor'] = "green"
                verdict_list[1].contents[0].replaceWith("PASS")
            elif "FAIL" == tw:
                verdict_list[1]['bgcolor'] = "red"
                verdict_list[1].contents[0].replaceWith("FAIL")

            #update Nelmon
            if "PASS" == nelmon:
                verdict_list[2]['bgcolor'] = "green"
                verdict_list[2].contents[0].replaceWith("PASS")
            elif "FAIL" == nelmon:
                verdict_list[2]['bgcolor'] = "red"
                verdict_list[2].contents[0].replaceWith("FAIL")

            #update assc
            if "PASS" == assc:
                verdict_list[3]['bgcolor'] = "green"
                verdict_list[3].contents[0].replaceWith("PASS")
            elif "FAIL" == assc:
                verdict_list[3]['bgcolor'] = "red"
                verdict_list[3].contents[0].replaceWith("FAIL")

            #update script
            if "PASS" == script:
                verdict_list[4]['bgcolor'] = "green"
                verdict_list[4].contents[0].replaceWith("PASS")
            elif "FAIL" == script:
                verdict_list[4]['bgcolor'] = "red"
                verdict_list[4].contents[0].replaceWith("FAIL")

            #update mgw
            if re.search("PASS", mgw):
                verdict_list[5]['bgcolor'] = "green"
                verdict_list[5].contents[0].replaceWith("PASS")
            elif re.search("FAIL", mgw):
                verdict_list[5]['bgcolor'] = "red"
                verdict_list[5].contents[0].replaceWith("FAIL")
            elif re.search("ALERT|CRITICAL", mgw):
                verdict_list[5]['bgcolor'] = "#800000"
                verdict_list[5].contents[0].replaceWith("CRITICAL")
                tc.parent.parent['bgcolor'] = "#800000"

            #update boa
            if "PASS" == boa:
                verdict_list[6]['bgcolor'] = "green"
                verdict_list[6].contents[0].replaceWith("PASS")
            elif "FAIL" == boa:
                verdict_list[6]['bgcolor'] = "red"
                verdict_list[6].contents[0].replaceWith("FAIL")
        except:
            print "%s haven't been included in BU test cases, please contact with BU team" % tc_name
    else:
        execution_name = re.search("execution=(.*?) ", src).group(1).strip()
        mgw_version = re.search("mgw_version=(.*?)il", src).group(1).strip()
        #il_version = re.search("il_version=(.*?) ", src).group(1).strip()
        #fp_version = re.search("fp_version=(.*?) ", src).group(1).strip()
        #prbs_version = re.search("prbs_version=(.*?) ", src).group(1).strip()
        url = re.search("url=(.*)", src).group(1).strip()

        # since there is a "\n" at the end of every line, nextSibling is needed twice
        # if mgw_version, il_version, fp_version and prbs_version are NA or empty, then update the info; otherwise skip.
        #update mgw_version
        MGW = soup.find(text="release lable:")
        if MGW.parent.nextSibling.nextSibling.contents[0] == "NA" or \
        MGW.parent.nextSibling.nextSibling.contents[0] == "":
            MGW.parent.nextSibling.nextSibling.contents[0].replaceWith(
                mgw_version)
        #update il_version
        #IL = soup.find(text="IL version:")
        #if IL.parent.nextSibling.nextSibling.contents[0] == "NA" or \
        #        IL.parent.nextSibling.nextSibling.contents[0] == "":
        #    IL.parent.nextSibling.nextSibling.contents[0].replaceWith(il_version)
        #update fp_version
        #FP = soup.find(text="FP version:")
        #if FP.parent.nextSibling.nextSibling.contents[0] == "NA" or \
        #        FP.parent.nextSibling.nextSibling.contents[0] == "":
        #    FP.parent.nextSibling.nextSibling.contents[0].replaceWith(fp_version)
        #updat prbs_version
        #PRBS = soup.find(text= "PRBs version:")
        #if PRBS.parent.nextSibling.nextSibling.a['href'] == "NA":
        #    PRBS.parent.nextSibling.nextSibling.a['href'] = prbs_version
        #    PRBS.parent.nextSibling.nextSibling.contents[0].contents[0].replaceWith(prbs_version)
        #updat urls for executions
        for k in table_map.keys():
            n = 1
            for i in table_map[k]:
                #if re.search(i, prbs_version): #Use in Open MGW
                if True:  #use in IPA
                    #print k
                    if soup.find(
                            text=re.compile("%s.*" % k)
                    ) == None:  #if update sanity test cases result, go to next execution
                        break
                    node = soup.find(text=re.compile("%s.*" % k)).parent
                    temp_soup = BeautifulSoup()
                    tag = Tag(temp_soup, 'a')
                    text = NavigableString("%s" % url)
                    tag.insert(0, text)
                    tag['href'] = "%s" % url
                    node.insert(n, tag)
                    #print node
                    n = n + 1
Example #25
def mexhelpextract(mexnames):
    #print 'processing mex files: ' + mexnames.__repr__()
    from ConfigParser import RawConfigParser as ConfigParser, Error as error
    for mexname in mexnames:
        # ConfigParser for the three elements per subfunctions written to tmpdir
        # [SubFunction]
        # usage: 'xyz'
        # help: 'xyz'
        # seealso: 'xyz'
        config = ConfigParser({'usage': [], 'help': [], 'seealso': []})
        # assemble command line for matlab
        matlabcmd = 'addpath(\'%s\');%s(\'%s\',\'%s\'); exit' % \
            (_tmpdir, \
             os.path.splitext(os.path.basename(_mexscript))[0], \
             mexname, \
             _tmpdir)
        cmd = 'matlab -nojvm -nodisplay -r "%s" > /dev/null' % matlabcmd
        # and execute matlab w/ the temporary script we wrote earlier
        try:
            print 'running MATLAB for %s in %s' % (mexname, _tmpdir)
            stdin, stderr = os.popen4(cmd)
            print stderr.read()
            stdin.close()
            stderr.close()
        except:
            print 'could not dump help for %s into %s' % (mexname, _tmpdir)

        cfgfile = config.read(os.path.join(_tmpdir, mexname))
        if cfgfile == []:
            print "skipping " + mexname + " (no output)"
            continue
        subfunctions = config.sections()
        print 'processing subfunctions: ' + subfunctions.__repr__()
        for subfunction in subfunctions:
            # read in the strings for this subfunction
            usage = config.get(subfunction, 'usage')
            help = config.get(subfunction, 'help')
            seealso = config.get(subfunction, 'seealso')

            headline = '===[[' + subfunction + ' ' + mexname + '(\'' + subfunction + '\')]]===\n'
            breadcrumb = "==[[Psychtoolbox]] &#8250; [[" \
                                + mexname + "]].{mex*,dll} subfunction==\n\n"

            # scrub the text for main text only
            body = beackern(help)

            docstring = '' \
                    + '%%(matlab;Usage)' \
                    + usage \
                    + '%%\n' \
                    + body \
                    + '\n\n'
            if seealso:
                docstring = docstring + '<<=====See also:=====\n' + seealso + '<<'

            text =  '""' + headline \
                    + breadcrumb \
                    + docstring + '""'

            # retrieve old body text, to update or concatenate with synonymous subfunctions
            #
            # browse the page
            title = re.sub("[^\w]|_", "", subfunction)
            try:
                resp = mech.open(baseurl + title + "/edit")
            except HTTPError, e:
                sys.exit(
                    "retrieving old text during posting of this mex function failed: %d: %s"
                    % (e.code, e.msg))
            # get text from the edit form
            mech.select_form(nr=1)
            try:
                oldbody = mech["body"]
            except:
                print 'No id="body" form. Figure this out first. cf. page text above.'
                for form in mech.forms():
                    print form
                sys.exit(
                    "retrieving old body text failed while processing page: " +
                    baseurl + title + '/edit')

            # parse embedded structuring HTML tags in the wiki text
            soup = BeautifulSoup(oldbody)

            # check if the subfunction is already present, by CSS 'class' and 'id'
            subfct = soup.find('div', {'class': "subfct", 'id': mexname})
            if subfct:
                # replace the text of the container DIV
                subfct.contents[0].replaceWith(text)
            else:
                # construct a new DIV to hold the text
                subfctDIV = Tag(soup, "div")
                subfctDIV['class'] = 'subfct'
                subfctDIV['id'] = mexname
                subfctDIV.insert(0, NavigableString(text))

                # insert the new div
                soup.insert(len(soup), subfctDIV)

            # Now scoop the good well-formed divs out of the soup
            divs = soup('div', {'class': "subfct"})

            # and drop them into fresh yummy cheese soup
            cheesesoup = BeautifulSoup()

            # drop good divs into the soup, one by one
            for div in divs:
                # remove the unneeded style attribute, we finally
                # have this stuff defined in the ptbdocs.css now.
                del (div['style'])
                # escape the HTML tags for wiki parser
                cheesesoup.append(NavigableString('\n""'))
                cheesesoup.append(div)
                cheesesoup.append(NavigableString('""\n'))

            post(subfunction, cheesesoup.renderContents())
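Example #26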
def createElement(tagname, contents):
    soup = BeautifulSoup()
    element = Tag(soup, tagname)
    text = NavigableString(contents)
    element.insert(0, text)
    return element
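createElement above is a small convenience wrapper; a quick usage sketch (output shown as a comment, assuming BeautifulSoup 3 rendering):

caption = createElement('div', 'Figure 1: anchors and hover links')
print(caption)   # <div>Figure 1: anchors and hover links</div>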
Example #27
def main(argv):
    
    parser = argparse.ArgumentParser(description='Downloads a website into a format suitable for use with phishing frenzy')
    parser.add_argument("site_addr", action="store", help="Site address")
    
    args = parser.parse_args()
    site_addr = args.site_addr
    
    #########################################
    #Get stuff from config file
    #########################################
    config_file = "config/website_cloner.config"
    if os.path.exists(config_file):
        pass
    else:
        try:
            print "Specified config file not found. Copying example config file..."
            shutil.copyfile("config/website_cloner.default", config_file)
        except:
            print "Error copying default config file...quitting execution..."
            sys.exit()
    
    config = ConfigParser.SafeConfigParser()
    config.read(config_file)
    
    try:
        working_dir = config.get("general", "working_dir")
        header_text = config.get("html", "header_text")
        body_text = config.get("html", "body_text")
        
    except:
        print "Missing required config file sections. Check running config file against provided example\n"
        sys.exit()
    
    site_path = site_addr.replace("http://","")
    site_path = site_path.replace("https://","")
    working_dir = os.path.join(working_dir, site_path,'')
    if not os.path.exists(working_dir):
        os.makedirs(working_dir)
    
    os.chdir(os.path.dirname(working_dir))
    
    #########################################
    #Get the site we are cloning
    #########################################    
    
    if not site_addr[:4] == "http":
        site_addr = "http://"+site_addr
        
    try:
        site_text=urllib2.urlopen(site_addr).read()
    except:
        print "Could not open site...quitting..."
        sys.exit()
        
    #soup=BeautifulSoup(header_text+site_text)
    soup=BeautifulSoup(site_text)
    head=soup.find('head')
    head.insert(0,NavigableString(header_text))
    body=soup.find('body')
    body.insert(0,NavigableString(body_text))
    
    ###############################################
    #Detect hyperlinked images and download locally
    ###############################################
    imageList = []
    
    for tag in soup.findAll('img', src=True):
        imageList.append(tag['src'])

    if not imageList:
        pass
    else:
        for url in imageList:
            try:
                filename = url.split('/')[-1].split('#')[0].split('?')[0]
                soup = BeautifulSoup(str(soup).decode("UTF-8").replace(url,filename).encode("UTF-8"))
                
                if not url.startswith('http'):
                    url = urllib2.urlparse.urljoin(site_addr,url)
                print "getting " + url + "..."
                                
                open(filename,"wb").write(urllib2.urlopen(url, timeout=5).read())
            except:
                pass

    cssList = []
    
    for tag in soup.findAll('link', {'rel':'stylesheet'}):
        cssList.append(tag['href'])

    if not cssList:
        pass
    else:
        for url in cssList:
            try:
                filename = url.split('/')[-1].split('#')[0].split('?')[0]
                soup = BeautifulSoup(str(soup).decode("UTF-8").replace(url,filename).encode("UTF-8"))
                
                if not url.startswith('http'):
                    url = urllib2.urlparse.urljoin(site_addr,url)
                print "getting " + url + "..."
                                
                open(filename,"wb").write(urllib2.urlopen(url, timeout=5).read())
            except:
                pass

    scriptList = []
    
    for tag in soup.findAll('script', src=True):
        scriptList.append(tag['src'])

    if not scriptList:
        pass
    else:
        for url in scriptList:
            try:
                filename = url.split('/')[-1].split('#')[0].split('?')[0]
                soup = BeautifulSoup(str(soup).decode("UTF-8").replace(url,filename).encode("UTF-8"))
                
                if not url.startswith('http'):
                    url = urllib2.urlparse.urljoin(site_addr,url)
                print "getting " + url + "..."
                                
                open(filename,"wb").write(urllib2.urlopen(url, timeout=5).read())
            except:
                pass

    ##########################################
    #Clean up html output and make it readable
    ##########################################                               
    mainpage = soup.prettify()
    mainpage = mainpage.replace('&lt;','<')
    mainpage = mainpage.replace('&gt;','>')
    
    open("index.php","wb").write(mainpage)
Example #28
    chart_url += str(value_killed)
    chart_url += ','
    chart_url += str(value_killer)
    chart_url += '&chtt=Twitter+Analysis+Chart'

    #http://chart.apis.google.com/chart?chxl=0:|Policeman+Killed|Killed+by+police&chxs=0,676767,11.5,0,lt,676767&chxt=x&chbh=a,100&chs=300x200&cht=bvg&chco=FF0000&chd=t:30,70&chtt=Twitter+Analysis+Chart

    # Now, create an HTML page with the information
    #  The page is simple: head with title, body with a big div holding an image (the chart) and 5 additional divs with text
    htmldata = BeautifulSoup()

    htmltag = Tag(htmldata, "html")
    headtag = Tag(htmldata, "head")

    titletag = Tag(htmldata, "title")
    titletag.insert(0, NavigableString('Twitter Stream Analysis Example'))

    bodytag = Tag(htmldata, "body")

    imgtag = Tag(htmldata, "img")
    imgtag['src'] = chart_url

    divtag_wrap = Tag(htmldata, "div")
    divtag_t1 = Tag(htmldata, "div")
    divtag_t1.insert(
        0,
        NavigableString('Total sentences analyzed: ' + str(total_sentences) +
                        ' taken from 400 public tweets'))

    divtag_t2 = Tag(htmldata, "div")
    divtag_t2.insert(
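Example #29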
def makeImagesLocal(soup, params):
    """ deal with internal and external image references """

    for img in soup.findAll('img'):
        # 'internal' images are marked with class="internal-resource"
        # in order to prevent image fetching later on
        if 'internal-resource' in (img.get('class') or ''):
            continue

        src = img['src']
        if params['request'] and src.startswith(params['request'].BASE0) \
            and '++resource++' not in src:
            src = src.replace(params['request'].BASE0 + '/', '')

        if src.startswith('http'):
            try:
                img_data = urllib2.urlopen(str(src)).read()

            except urllib2.URLError:
                LOG.warn('No image found: %s - removed from output' % src)
                img.extract()
                continue

            tmpname = tempfile.mktemp(dir=params['destdir'])
            file(tmpname, 'wb').write(img_data)
            img['src'] = os.path.basename(tmpname)

        else:
            # image with relative URL

            # first lookup image by direct traversal
            img_path = urllib.unquote(str(src))
            img_obj = params['context'].restrictedTraverse(img_path, None)
            if img_obj is None:
                img_path2 = getToolByName(
                    params['context'], 'portal_url').getPortalPath() + img_path
                img_obj = params['context'].restrictedTraverse(img_path2, None)

            if img_obj is None and 'resolveuid' in src:
                mo = uid_reg.search(src)
                if mo:
                    uid = mo.group(0)
                    img_obj = params['context'].reference_catalog.lookupObject(
                        uid)

            # For scaled images ('_preview', '_large' etc.) use the original
            # image always (which is stored as acquisition parent)
            if img_obj:
                has_portal_type = hasattr(aq_base(img_obj.aq_inner),
                                          'portal_type')
                if has_portal_type and img_obj.portal_type == img_obj.aq_parent.portal_type:
                    img_obj = img_obj.aq_parent

            if img_obj is None:
                # nothing found, check the next parent node with a 'path' parameter
                # referring to the origin document
                parent_container_path = pathFromParent(soup, img)
                if parent_container_path is not None:
                    img_obj = params['context'].restrictedTraverse(
                        '%s/%s' % (parent_container_path, img_path), None)

            # still nothing found
            if img_obj is None:

                img_split = img_path.split('/')
                if img_split[-1].startswith(
                        'image_') or img_split[-1].startswith('image-'):
                    img_path = '/'.join(img_split[:-1])
                for image_path in params['images']:
                    if image_path.endswith(img_path):
                        img_obj = params['context'].restrictedTraverse(
                            image_path, None)
                        break

            # get hold of the image in original size
            if img_obj:
                # thumbnails have an Image as aq_parent
                if img_obj.aq_parent.portal_type == 'Image':
                    img_obj = img_obj.aq_parent

            if img_obj:
                img_data = None
                for attr in ['data', '_data']:
                    try:
                        img_data = str(getattr(img_obj, attr))
                        break  # stop at the first attribute that holds the image data
                    except AttributeError:
                        pass
                if img_data == None:
                    LOG.warn('No image found: %s - removed from output' %
                             img_path)
                    img.extract()
                    continue

                tmpname = tempfile.mktemp(dir=params['destdir'])
                file(tmpname, 'wb').write(img_data)
                img['src'] = os.path.basename(tmpname)

                # image scaling
                try:
                    scale = img_obj.getField('pdfScale').get(img_obj)
                except AttributeError:
                    scale = 100

                # add content-info debug information
                # don't add scale as style since the outer image-container
                # has the style set
                img['scale'] = str(scale)

                # now move <img> tag into a dedicated <div>
                div = Tag(soup, 'div')
                div['class'] = 'image-container'
                #                div['style'] = 'width: %d%%' % scale
                div['scale'] = str(scale)
                div.insert(0, copy.copy(img))

                # image caption
                img_description = img_obj.Description()
                img_caption = Tag(soup, 'div')
                img_caption['class'] = 'image-caption'

                # exclude from image enumeration
                context = params['context']
                exclude_field = img_obj.getField('excludeFromImageEnumeration')
                if exclude_field and not exclude_field.get(img_obj):
                    span = Tag(soup, 'span')
                    classes = ['image-caption-text']
                    description = img_obj.Description()
                    if description:
                        classes.append('image-caption-text-with-text')
                    else:
                        classes.append('image-caption-text-without-text')
                    span['class'] = ' '.join(classes)
                    if description:
                        span.insert(0, NavigableString(description))
                    img_caption.insert(0, span)
                    div.append(img_caption)

                img.replaceWith(div)

            else:
                LOG.warn('No image found: %s - not removed, keeping it' %
                         img_path)
Example #30
 def insertText(self, data, insertBefore=None):
     text = TextNode(NavigableString(data), self.soup)
     if insertBefore:
         self.insertBefore(text, insertBefore)
     else:
         self.appendChild(text)
Example #31
def main():
    """Create an XML database containing a word from the GNT, its PROIEL ID # and other PROIEL info."""

    aligned = codecs.open("aligned-gospels.wds", "rU", "utf-8")

    xml = codecs.open("proiel-GNT.xml", "rU", "utf-8")

    print "Parsing the PROIEL XML with BeautifulStoneSoup..."
    print

    proiel = BeautifulStoneSoup(xml)

    tokens = proiel.findAll('token')

    tok_dict = {}

    # creating a dictionary keyed by PROIEL IDs to speed up searching
    for token in tokens:
        tok_dict[token['id']] = token

    output = open("gospels-database.xml", "w")

    print >> output, "<div>"

    print >> output, "<title>Gospels</title>"

    count = 100001

    soup = BeautifulStoneSoup()

    word = Tag(soup, "word")

    print "Iterating through the alignment file..."
    print

    for line in aligned:
        stuff = line.split("\t")
        word = Tag(soup, "word")
        form = NavigableString(stuff[0])
        word.insert(0, form)
        # make it so that the IDs count up from 000000, not 100000
        word['id'] = str(count).replace("1", "0", 1)
        word['proiel-id'] = stuff[1]

        # adding attributes from the PROIEL XML
        if stuff[1] != "000000" and stuff[1] != "999999" and stuff[1] != "111111":
            token = tok_dict[stuff[1]]
            morph = token['morph-features'].split(",")
            word['lemma'] = morph[0]
            word['proiel-pos'] = morph[1]
            word['lang'] = morph[2]
            word['morph'] = morph[3]
            word['deprel'] = token['relation']
            try:
                word['proiel-head-id'] = token['head-id']
            except KeyError:
                word['proiel-head-id'] = "root"
        word['proiel-form'] = stuff[2].rstrip()
        count += 1
        print >> output, word

    print >> output, "</div>"

    print "Done!"
    print