def open(self, book_id=None):
    """Open the epub archive for the given book id and load its metadata.

    Sets self.size (human-readable size in MiB), self.f (open ZipFile),
    self.oebps_folder, self.chapters (list of (label, src) tuples) and
    self.cover_href.

    :param book_id: optional book id; falls back to self.book_id.
    :raises Exception: if no book id is available.
    """
    if book_id:
        self.book_id = book_id
    if not self.book_id:
        raise Exception('Book id not set')
    # File size in MiB with one decimal; anything that rounds to 0.0 is
    # displayed as "<0.1".
    self.size = os.path.getsize(self._FILE % (self.book_id, self.book_id))
    sz_mult = 1.0 / (1024**2)
    result = u'%.1f' % (self.size * sz_mult)
    self.size = u'<0.1' if result == u'0.0' else result
    self.f = zipfile.ZipFile(self._FILE % (self.book_id, self.book_id), 'r')
    # container.xml names the OPF ("rootfile") entry inside the archive.
    soup = BeautifulStoneSoup(self.f.read('META-INF/container.xml'))
    oebps = soup.findAll('rootfile')[0]['full-path']
    folder = oebps.rfind(os.sep)
    # Folder holding the OEBPS content ('' when the OPF sits at the root).
    # NOTE(review): os.sep is the host separator, but zip entry names use
    # '/' — confirm this works on Windows.
    self.oebps_folder = '' if folder == -1 else oebps[:folder + 1]
    oebps_content = self.f.read(oebps)
    self.read_doc_props(oebps_content)
    opf_bs = BeautifulStoneSoup(oebps_content)
    ncx = opf_bs.findAll('item', {'id': 'ncx'})[0]
    ncx = self.oebps_folder + ncx['href']  # full path of the NCX file
    ncx_bs = BeautifulStoneSoup(self.f.read(ncx))
    # One (navigation label, content src) pair per navpoint.
    self.chapters = [
        (nav.navlabel.text, nav.content['src'])
        for nav in ncx_bs.findAll('navmap')[0].findAll('navpoint')
    ]
    self.cover_href = self.chapters[0][1]  # cover path
def __call__(self, content: str) -> str:
    """Extract the link texts from the first table in *content*.

    :param content: raw markup to parse.
    :return: newline-joined text of every <a> inside the first <table>.
    """
    document = BeautifulStoneSoup(content)
    anchors = document.table.findAll('a')
    return '\n'.join(anchor.text for anchor in anchors)
def HTMLEntitiesToUnicode(text):
    """Convert HTML entities in *text* to unicode characters.

    For example '&amp;' becomes '&'.

    :param text: HTML-laden string.
    :return: the converted text as a plain string.
    """
    soup = BeautifulStoneSoup(text,
                              convertEntities=BeautifulStoneSoup.ALL_ENTITIES)
    # Bug fix: the soup object itself was returned before; callers expecting
    # text (cf. the str()-wrapping method variant of this helper elsewhere in
    # the codebase) received a parse tree instead.
    return str(soup)
def trigger_w(self, msg):
    "Usage: w <search term>. Prints a short description of the corresponding wikipedia article."
    if len(msg.args) == 0:
        self.bot.notice(msg.nick, "Please specify a search term")
        return
    # Wikipedia OpenSearch API; ask for two hits so a disambiguation page
    # ("may refer to:") can be skipped in favour of the second result.
    params = {
        'action': 'opensearch',
        'format': 'xml',
        'limit': '2',
        'search': ' '.join(msg.args)
    }
    url = 'http://{:s}.wikipedia.org/w/api.php'.format(self.language)
    response = BeautifulStoneSoup(requests.post(url, data=params).text)
    # Damn BS4 is case sensitive, hence all the regex.
    if response.find(re.compile('text', re.I)):
        index = 0
        if "may refer to:" in response.find(re.compile('description', re.I)).string:
            index = 1  # first hit is a disambiguation page; take the next one
        info = response.find_all(re.compile('description', re.I))[index].string.strip()
        url = response.find_all(re.compile('url', re.I))[index].string
        short_url = self.shorten(url)
        # \002 toggles IRC bold formatting.
        message = u"\002Wikipedia ::\002 {} \002::\002 {}".format(info, short_url)
        self.bot.privmsg(msg.channel, message)
    else:
        self.bot.privmsg(msg.channel, "{}: no articles were found.".format(' '.join(msg.args)))
def fuseReferences(doc, ref):
    """Merge *ref* into the previously parsed reference of *doc*.

    Used when a single bibliography entry was split across two XML chunks:
    the previous reference is removed, its XML body is concatenated with
    *ref*'s, and the combined element is re-parsed. The replaced id is
    recorded in doc["metadata"]["ref_replace_list"] so citations pointing
    at it can be remapped later.

    :param doc: SciDoc-like dict with "references" and "metadata".
    :param ref: reference tag to fuse into the previous reference.
    """
    prevref = doc["references"][-1]
    doc["metadata"]["ref_replace_list"] = doc["metadata"].get(
        "ref_replace_list", {})
    # Renamed from "id": don't shadow the builtin.
    ref_id = ""
    try:
        ref_id = ref["id"]
        if not ref_id:
            ref_id = prevref["id"]
            if isinstance(ref_id, six.string_types):
                ref_id = "ref" + str(len(doc["references"]) + 1)
            elif isinstance(ref_id, int):
                ref_id = ref_id + 1
    except Exception:
        # Missing "id" attribute (or malformed tag): fabricate a fresh id.
        # (Narrowed from a bare except: so ^C etc. are no longer swallowed.)
        ref_id = "ref" + str(len(doc["references"]) + 1)
    doc["metadata"]["ref_replace_list"][ref_id] = prevref["id"]
    doc["references"].remove(prevref)
    # Drop the closing tag of the previous fragment and the opening tag of
    # the new one so the two bodies concatenate into a single element.
    fullstring = re.sub(r"</reference>", "", prevref["xml"], 0, re.IGNORECASE)
    fullstring += re.sub(r"<reference.+?>", "", ref.__repr__(), 0, re.IGNORECASE)
    ref = BeautifulStoneSoup(fullstring).find("reference")
    processReferenceXML(ref, doc, False)
def loadJATSSentence(self, s, newDocument, par_id, section_id):
    """
    Loads a JATS sentence (ready split)

    :param s: the plain text of the sentence (with all tags inside, e.g. <xref>)
    :param newDocument: SciDoc
    :param par_id: id of the paragraph containing this sentence
    :param section_id: id of the section containing the paragraph
    """
    newSent = newDocument.addSentence(par_id, "")
    s_soup = BeautifulStoneSoup(s)
    # Bibliographic cross-references become citations attached to this sentence.
    refs = s_soup.findAll("xref", {"ref-type": "bibr"})
    citations_found = []
    for r in refs:
        citations_found.extend(
            self.loadJATSCitation(r, newSent["id"], newDocument,
                                  section=section_id))
    # Non-bibliographic xrefs (figures, tables, ...) are renamed to <inref>
    # so downstream processing can tell them apart from citations.
    non_refs = s_soup.findAll(
        lambda tag: tag.name.lower() == "xref" and "ref-type" in tag and tag["ref-type"].lower() != "bibr")
    for nr in non_refs:
        nr.name = "inref"
    newSent["citations"] = [acit["id"] for acit in citations_found]
    # TODO replace <xref> tags with <cit> tags
    newSent["text"] = newDocument.extractSentenceTextWithCitationTokens(
        s_soup, newSent["id"])
    ## print(newSent["text"])
    # deal with many citations within characters of each other: make them know they are a cluster
    # TODO cluster citations? Store them in some other way?
    newDocument.countMultiCitations(newSent)
def test_beautifulstonesoup_is_xml_parser(self):
    # Make sure that the deprecated BSS class uses an xml builder
    # if one is installed.
    with warnings.catch_warnings(record=True) as w:
        soup = BeautifulStoneSoup("<b />")
    # An XML tree builder renders the void tag as "<b/>"; an HTML builder
    # would emit "<b></b>".
    self.assertEqual("<b/>", str(soup.b))
    # Instantiating BSS must emit its deprecation warning.
    self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
def parse(self, content: str) -> str:
    """Parses web content.

    Collects the href attribute of every <a> inside the first table and
    returns them newline-separated.
    """
    document = BeautifulStoneSoup(content)
    hrefs = [anchor['href'] for anchor in document.table.findAll('a')]
    return '\n'.join(hrefs)
def __init__(self):
    try:
        # Fetch the XML feed as a string and parse it.
        with req.urlopen(self.URL) as open_url:
            soup = BeautifulStoneSoup(open_url.read())
        # Skip the first two entries.
        # NOTE(review): presumably header/stale items — confirm against the
        # feed layout.
        self.FEED = (series.series_from_xml(soup, 'item'))[2:]
    except Exception as e:
        # Best-effort: a failed fetch is logged, leaving FEED unset.
        logger.error("Cannot get a XML-file: %s" % e)
def response_soup(self):
    "Returns a BeautifulSoup object of the response."
    # Parse the response body lazily and keep the tree for repeated access.
    if not self._response_soup:
        decoded = str(self._response_content, encoding='utf-8')
        self._response_soup = BeautifulStoneSoup(decoded)
    return self._response_soup
def get_all_urls(self):
    """Return the list of URLs gathered from every configured sitemap page."""
    collected = []
    for page_url in self.urls:
        response = self.session.get(page_url, headers=self.headers)
        tree = BeautifulStoneSoup(response.content)
        # Each <loc> element's text node is one URL.
        collected.extend(loc.next_element for loc in tree.find_all('loc'))
    return collected
def render(self, context):
    """Return the fan/follower count for the configured social service.

    Dispatches on self.service ("facebook" or "twitter") with
    self.service_id, fetches the count from the service API, and caches it
    for 30 minutes keyed on the request URL. Returns '' for an unknown
    service; a failed fetch leaves the cached/empty value in place.
    """
    fancount = ''
    fb_api_url = 'http://api.facebook.com/restserver.php'
    tw_api_url = 'http://api.twitter.com'
    cache_time = 1800

    def _fetch_count(xml_path, node_name, extract):
        # Shared fetch-and-cache logic for both services (previously
        # duplicated inline). Best-effort by design: any network or parse
        # error is swallowed and the stale/None value returned.
        cache_key = md5(xml_path.encode()).hexdigest()
        count = cache.get(cache_key)
        if not count:
            try:
                xml = urlopen(xml_path)
                soup = BeautifulStoneSoup(xml.read())
                for node in soup.find_all(node_name):
                    count = extract(node)
                cache.set(cache_key, count, cache_time)
            except Exception:
                pass
        return count

    if self.service == "facebook":
        query = '%s?method=facebook.fql.query&query=SELECT%%20fan_count%%20FROM%%20page%%20WHERE%%20page_id=%s'
        fancount = _fetch_count(query % (fb_api_url, self.service_id),
                                'page', lambda node: node.fan_count.string)
    if self.service == "twitter":
        query = "%s/1/users/show/%s.xml"
        fancount = _fetch_count(query % (tw_api_url, self.service_id),
                                'user', lambda node: node.followers_count.string)
    return fancount
def parseCermineXML(self, xml_string):
    """Load the Cermine output contained in *xml_string*.

    Currently only the references are read.

    :param xml_string: full Cermine/ParsHed XML output.
    :return: the parsed references.
    """
    parsed = BeautifulStoneSoup(
        xml_string, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    # TODO implement reading the rest of the Cermine/ParsHed tagging
    return self.readReferences(parsed)
def read(self, xml, identifier): """ Load a JATS/NLM (PubMed) XML into a SciDoc. :param xml: full xml string :type xml: basestring :param identifier: an identifier for this document, e.g. file name If an actual full path, the path will be removed from it when stored :type identifier: basestring :returns: :class:`SciDoc <SciDoc>` object :rtype: SciDoc """ # this solves a "bug" in BeautifulStoneSoup with "sec" tags BeautifulStoneSoup.NESTABLE_TAGS["sec"] = [] #xml=fixNumberCitationsXML(xml) soup = BeautifulStoneSoup(xml) # Create a new SciDoc to store the paper newDocument = SciDoc() metadata = newDocument["metadata"] metadata["filename"] = os.path.basename(identifier) metadata["original_citation_style"] = detectCitationStyle(xml) body = soup.find("body") if not body: # TODO: Make the error handling less terrible debugAddMessage(newDocument, "error", "NO <BODY> IN THIS PAPER! file: " + identifier) newDocument["metadata"]["guid"] = cp.Corpus.generateGUID() return newDocument # Load metadata, either from corpus or from file self.loadJATSMetadataFromPaper(newDocument, soup) metadata["guid"] = cp.Corpus.generateGUID(metadata) # Load all references from the XML back = soup.find("back") if back: ref_list = back.find("ref-list") # other things in <back> like appendices: ignore them for now if ref_list: for ref in ref_list.findAll("ref"): self.loadJATSReference(ref, newDocument) newDocument.updateReferences() # Load Abstract self.loadJATSAbstract(soup, newDocument) for sec in body.findChildren("sec", recursive=False): self.loadJATSSection(sec, newDocument, "root") newDocument.updateAuthorsAffiliations() return newDocument
def soup_maker(fh):
    """Build a soup from the file handle (or markup string) *fh*.

    Prefers bs4's BeautifulSoup with the lxml tree builder; falls back to
    the legacy BeautifulSoup 3 BeautifulStoneSoup when bs4 is unavailable.

    :param fh: file handle or markup to parse.
    :return: parsed soup; on the bs4 path all tag names are lowercased.
    """
    try:
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(fh, "lxml")
        # Normalise tag names so downstream lookups are case-insensitive.
        for tag in soup.find_all():
            tag.name = tag.name.lower()
    except ImportError:
        # Bug fix: the fallback previously re-imported from bs4 — the very
        # package whose import just failed — so it could never succeed.
        # Import the BeautifulSoup 3 class instead (matches the fallback
        # used by response_soup elsewhere in this codebase).
        from BeautifulSoup import BeautifulStoneSoup
        soup = BeautifulStoneSoup(fh)
    return soup
def login(self):
    """ Read greeting """
    # The server sends a greeting on connect; show its server id and version.
    greeting = self.read()
    soup = BeautifulStoneSoup(greeting, 'lxml')
    svid = soup.find('svid')
    version = soup.find('version')
    print("Connected to %s (v%s)\n" % (svid.text, version.text))
    """ Login """
    # Fill the login command template from the configuration and abort the
    # whole process if the server rejects it.
    xml = commands.login % self.config
    if not self.cmd(xml, silent=True):
        exit(1)
def getDetailsForSerieByID(self, serieName, serieID):
    # Fetch the banner list for a series (Python 2: urllib2/cookielib) and
    # record the first English banner URL of each type (fanart / poster /
    # season / series), plus its thumbnail when present, into
    # KNOWN_SHOWS[serieName]. Returns the updated entry, or None on error.
    url = SERIE_DETAILS_URL % (urllib.quote(serieID))
    try:
        # Change the User Agent
        USER_AGENT = 'Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10'
        cj = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        req = urllib2.Request(url)
        req.add_header('User-Agent', USER_AGENT)
        resp = opener.open(req)
        soup = BeautifulStoneSoup(resp.read())
        resp.close()
        # "not 'X' in ... .keys()" guards keep only the FIRST banner of each
        # type; later English banners of the same type are ignored.
        for banner in soup.banners.findAll('banner'):
            if banner.language.string == 'en':
                if not 'Fanart' in KNOWN_SHOWS[serieName].keys(
                ) and banner.bannertype.string == 'fanart':
                    KNOWN_SHOWS[serieName]['Fanart'] = str(
                        BANNER_URL % (banner.bannerpath.string))
                    if banner.thumbnailpath:
                        KNOWN_SHOWS[serieName]['FanartThumb'] = str(
                            BANNER_URL % (banner.thumbnailpath.string))
                elif not 'Poster' in KNOWN_SHOWS[serieName].keys(
                ) and banner.bannertype.string == 'poster':
                    KNOWN_SHOWS[serieName]['Poster'] = str(
                        BANNER_URL % (banner.bannerpath.string))
                    if banner.thumbnailpath:
                        KNOWN_SHOWS[serieName]['PosterThumb'] = str(
                            BANNER_URL % (banner.thumbnailpath.string))
                elif not 'Season' in KNOWN_SHOWS[serieName].keys(
                ) and banner.bannertype.string == 'season':
                    KNOWN_SHOWS[serieName]['Season'] = str(
                        BANNER_URL % (banner.bannerpath.string))
                    if banner.thumbnailpath:
                        KNOWN_SHOWS[serieName]['SeasonThumb'] = str(
                            BANNER_URL % (banner.thumbnailpath.string))
                elif not 'Series' in KNOWN_SHOWS[serieName].keys(
                ) and banner.bannertype.string == 'series':
                    KNOWN_SHOWS[serieName]['Series'] = str(
                        BANNER_URL % (banner.bannerpath.string))
                    if banner.thumbnailpath:
                        KNOWN_SHOWS[serieName]['SeriesThumb'] = str(
                            BANNER_URL % (banner.thumbnailpath.string))
        return KNOWN_SHOWS[serieName]
    except:
        # Best-effort: any failure (network, parse, missing key) just logs
        # the URL and returns None.
        print 'Error: ' + url
        return None
def response_soup(self):
    "Returns a BeautifulSoup object of the response."
    try:
        from bs4 import BeautifulStoneSoup
    except ImportError:
        # BeautifulSoup 3 fallback — warn so the operator upgrades.
        from BeautifulSoup import BeautifulStoneSoup
        log.warn('DeprecationWarning: BeautifulSoup 3 or earlier is deprecated; install bs4 instead\n')
    # Already parsed once: reuse the cached tree.
    if self._response_soup:
        return self._response_soup
    self._response_soup = BeautifulStoneSoup(
        self._response_content.decode('utf-8'))
    return self._response_soup
def HTMLEntitiesToUnicode(self, text):
    """Converts HTML entities to unicode. For example '&amp;' becomes '&'.

    Args:
        text: HTML laden text to convert to unicode
    Returns:
        String converted to unicode, or None when conversion failed.
    """
    try:
        text = str(
            BeautifulStoneSoup(text,
                               convertEntities=BeautifulStoneSoup.ALL_ENTITIES))
        return text
    except Exception as e:
        # Bug fix: previously '"... %s ; Errors: %s" % text, e' — the %
        # operator binds before the comma, so the two-placeholder format
        # string got a single argument and raised TypeError inside the
        # handler. Interpolate both values as a tuple.
        print("error formatting string: %s ; Errors: %s" % (text, e))
        return None
def extract_info_from_html():
    # Scrape data/url.html for university links, codes, cities and
    # abbreviations, merge the codes into data/university.json, and write
    # the result to data/university_add_code.json.
    from bs4 import BeautifulStoneSoup
    import re
    html = open("data/url.html").read()
    soup = BeautifulStoneSoup(html)
    inputTag = soup.findAll("a")
    # Stringify the whole tag list and split on commas, then regex out each
    # href. NOTE(review): fragile — any comma inside a tag breaks the split.
    inputTag = str(inputTag).split(",")
    m = [re.search(" +href=\"(.*?)\"", i) for i in inputTag]
    urls = [i.group(1) for i in m]
    # University codes sit in <strong> tags; slice off the surrounding markup.
    code = [
        i[9:-9].replace("<", "") for i in str(soup.findAll('strong')).split(",")
    ]
    # City names follow each map-marker icon in the raw HTML.
    city = [
        i.split('<span class="uni-code">')[0].replace("\t", "").replace(
            "</span>", "").replace("\n", "") for i in html.split(
            '<i class="fa fa-map-marker" aria-hidden="true"></i>')[1:]
    ]
    # Abbreviations come from every other "name-group" div.
    abbr = [
        i.split('</div>')[0].replace("\t", "").replace("</span>",
                                                       "").replace("\n", "")
        for i in html.split('<div class="name-group">')[1::2]
    ]
    # NOTE(review): `urls` and `city` are computed but never used below.
    # ADD CODE TO UNI_INFO
    map_abbr_code = [{
        "abbr": m,
        "code": n
    } for m, n in zip(abbr, code) if m != ""]
    import json
    uni = json.load(open("data/university.json"))
    len(uni)
    # Copy each university record, attaching its code and dropping
    # duplicate abbreviations (first occurrence wins).
    new_uni = []
    abbrs = []
    for i in uni:
        if (i["abbr"] in abbrs):
            continue
        else:
            for j in map_abbr_code:
                if (j["abbr"] == i["abbr"]):
                    i["code"] = j["code"]
                    break
            new_uni.append(i)
            abbrs.append(i["abbr"])
    with open('data/university_add_code.json', 'w') as outfile:
        json.dump(new_uni, outfile, ensure_ascii=False, indent=4)
def parse_data(self, url):
    '''Collect the product data from *url* into a dict.

    Returns a dict with the product name, the regular and the
    registered-customer price, the article reference and the url itself.
    Raises Format_Exeption when the page is not a product page and
    Connection_Exception on a non-200 response.
    '''
    request = self.session.get(url, headers=self.headers)
    if request.status_code == 200:
        soup = BeautifulStoneSoup(request.content)
        # Reject 404 pages, empty search results and catalogue listings —
        # only a single-product page has the fields we scrape below.
        if not (bool(soup.find('div', {"class": 'error404__text'})) or
                bool(soup.find('div', {"class": 'nothing-search'})) or
                bool(soup.find('div', {"id": 'productList'}))):
            try:
                name_of_product = soup.find('h1').next_element
            except Exception:
                raise Format_Exeption('name', url)
            try:
                # Strip spaces/newlines from the displayed price.
                price_for_all = soup.find(
                    'span', {
                        "class": "item__price item__price--normal-left"
                    }).next_element.replace(" ", "").replace("\n", "")
            except Exception:
                # Missing price element means the item is out of stock.
                price_for_all = "Нет в наличии"
            try:
                price_for_registered = soup.find(
                    'span', {
                        "class": "item__price item__price--red-bold"
                    }).next_element.replace(" ", "").replace("\n", "")
            except Exception:
                price_for_registered = "Нет в наличии"
            try:
                # Second "articul" block carries the reference number; the
                # third whitespace-separated token is the number itself.
                reference = soup.findAll(
                    'div', {"class": "item__card-info-articul"})
                reference = reference[1].next_element
                reference = str(reference).split()[2].replace("-", '')
            except Exception:
                reference = "Нет номера"
            final = {
                "name_of_product": name_of_product,
                "price_for_all": price_for_all,
                "price_for_registered": price_for_registered,
                "reference": reference,
                "url": url
            }
            return final
        else:
            print("Не тот формат, вот ссылка {0}".format(url))
            raise Format_Exeption
    else:
        raise Connection_Exception
def parse_data(self, url):
    '''Collect the product data from *url* into a dict.

    Same contract as the sibling parser: returns name, prices, reference
    and url; raises Format_Exeption for non-product pages and
    Connection_Exception on a non-200 response.
    '''
    request = self.session.get(url, headers=self.headers)
    if request.status_code == 200:
        soap = BeautifulStoneSoup(request.content)
        # Reject category maps and catalogue listings — only a product
        # page has the fields scraped below.
        if not (bool(soap.find('table', {"class": 'map-columns'})) or bool(
                soap.find('div', {"class": 'col-md-12 catalog-items'}))):
            try:
                name_of_product = soap.find('h1', {
                    'class': 'title'
                }).next_element
            except Exception:
                raise Format_Exeption('name', url)
            try:
                # Strip spaces/newlines; drop the trailing currency char.
                price_for_all = soap.find('div', {
                    "class": "price"
                }).next_element.replace(" ", "").replace("\n", "")[:-1]
            except Exception:
                # Missing price element means the item is out of stock.
                price_for_all = "Нет в наличии"
            try:
                price_for_rozn = soap.find('div', {
                    "class": "rozn-price"
                }).next_element.replace(" ", "").replace("\n", "")[:-1]
                # Keep digits only (retail price may carry extra symbols).
                price_for_rozn = ''.join(
                    filter(str.isdigit, price_for_rozn))
            except Exception:
                price_for_rozn = "Нет в наличии"
            try:
                # Drop the "Артикул: " prefix and any dashes.
                reference = soap.find('div', {
                    'class': 'article'
                }).next_element.replace("-", '')[9:]
            except Exception:
                reference = "Нет номера"
            final = {
                "name_of_product": name_of_product,
                "price_for_all": price_for_all,
                "price_for_registered": price_for_rozn,
                "reference": reference,
                "url": url
            }
            return final
        else:
            print("Не тот формат, вот ссылка {0}".format(url))
            raise Format_Exeption
    else:
        raise Connection_Exception
def loadRefAuthorsFromSentence(sentence):
    """Convert <refauthor> tags in a sentence into citation dicts.

    :param sentence: dict with "text" (may contain <refauthor> markup)
        and "id" (recorded as each citation's parent sentence).
    :return: list of citation dicts, one per <refauthor> element found.
    """
    pattern = r"<refauthor.*?</refauthor>"
    skipped = {"links"}
    citations = []
    for fragment in re.findall(pattern, sentence["text"], re.IGNORECASE):
        tag_soup = BeautifulStoneSoup(fragment)
        citation = {"parent_s": sentence["id"]}
        # Copy every attribute of the parsed tag except the skipped ones.
        for attr_name in (attr[0] for attr in tag_soup.attrs
                          if attr[0] not in skipped):
            citation[attr_name] = tag_soup[attr_name]
        citations.append(citation)
    return citations
def login(self, account, password):
    """Submit the portal login form and classify the server's answer.

    :param account: user name for the UserName field.
    :param password: password for the UserPass field.
    :return: 1 when the captcha was wrong (cookies cleared on the first
        attempt), 2 for bad credentials (cookies cleared), 3 on success.
    """
    self.count += 1
    form_data = {
        '__LASTFOCUS': '',
        '__VIEWSTATE': '/wEPDwUKLTYyODEyMzMzMGRkL3e45wwAbXRMklziclTsgEdzyPEwTYlRK/82rSW9ia4=',
        '__EVENTTARGET': '',
        '__EVENTARGUMENT': '',
        '__EVENTVALIDATION': '/wEdAAU54OdiNscYklAhFFRo5mKvR1LBKX1P1xh290RQyTesRe73C5Hghb+Z/bZTMreJjC5a26FEoUIR27AQFJNWWcL9lvD3Xdq7ldHy+JQ2tBNJGaOgZ5o+9oUn7QAVYx4o/XgeS3eF3mvkRXiWGnMfaCgO',
        'UserName': account,
        'UserPass': password,
        'CheckCode': self.get_capture(),
        'Btn_OK': '(unable to decode value)',
    }
    data = urllib.parse.urlencode(form_data).encode(encoding='utf-8')
    resquest = urllib.request.Request(self.login_url, data, self.headers)
    response = self.opener.open(resquest)
    login_content = response.read().decode(self.character).encode("utf-8")
    ss = str((BeautifulStoneSoup(login_content)))
    # The page reports errors via a JS alert; grep its text out of the soup.
    if re.findall('验证码输入错误.', ss):
        # Captcha was wrong.
        print('验证码输入错误')
        # Bug fix: 'self.count is 1' compared object identity on an int
        # (implementation-dependent); use value equality.
        if self.count == 1:
            # NOTE(review): clear() returns None, so this sets self.cookie
            # to None — preserved from the original; confirm intent.
            self.cookie = self.cookie.clear()
        return 1
    elif re.findall('用户名或密码错误.', ss):
        # Wrong user name or password.
        print('用户名或密码错误')
        self.cookie = self.cookie.clear()
        return 2
    else:
        return 3
def read(self, xml, filename):
    """Load a document from the Athar corpus.

    Args:
        xml: full xml string
        filename: source file name (kept for interface parity)
    Returns:
        (all_docs, all_contexts): the loaded documents and the citation
        contexts collected from them.
    """
    soup = BeautifulStoneSoup(xml)
    paper_data_node = soup.find("div", {"class": "dstPaperData"})
    paper_data = {
        "id": paper_data_node.text,
        "title": "",
        "authors": "",
    }
    title = paper_data_node.find("div", {"class": "dstPaperTitle"})
    if title:
        paper_data["title"] = title.text
    authors = paper_data_node.find("div", {"class": "dstPaperAuthors"})
    if authors:
        # Bug fix: previously split title.text here — the author list lives
        # in the dstPaperAuthors node, not in the title.
        author_chunks = authors.text.split(";")
        for author in author_chunks:
            chunks = author.split(",")
            author_dict = {"given": chunks[1], "family": chunks[0]}
            # NOTE(review): each iteration overwrites the previous value, so
            # only the last author survives; this probably should accumulate
            # a list, but callers may rely on the current shape — confirm.
            paper_data["authors"] = author_dict
    all_contexts = []
    all_docs = []
    document_nodes = soup.findAll("table", {"class": "srcPaper"})
    for index, document_node in enumerate(document_nodes):
        try:
            doc, contexts = self.loadDocumentNode(document_node, paper_data,
                                                  index)
            all_docs.append(doc)
            all_contexts.extend(contexts)
        except ValueError:
            # A malformed document aborts the remainder of the file.
            print("Error:", sys.exc_info()[1])
            break
    return all_docs, all_contexts
def run(self, file_name, user, **kwargs):
    """
    Parse the given xml file using BeautifulSoup.
    Save all Article, Redirect and Page objects.
    """
    f = open(file_name, 'r')
    xml = f.read()
    f.close()
    soup = BeautifulStoneSoup(xml)
    items = soup.find_all('item')
    # Dispatch each WordPress <item> by its post type/status.
    for item in items:
        post_type = item.find('wp:post_type').string
        post_status = item.find('wp:status').string
        if post_type == 'attachment':
            get_media(item, user)
            # Note! This script assumes all the attachments come before
            # posts and pages in the xml. If this ends up changing,
            # do two loops, one with attachments and the second with posts and pages.
        elif post_type == 'post' and post_status == 'publish':
            get_posts(item, user)
        elif post_type == 'page' and post_status == 'publish':
            get_pages(item, user)
    # Notify the importing user by email once the import finishes.
    if user.email:
        context = {
            'SITE_GLOBAL_SITEDISPLAYNAME': get_setting('site', 'global', 'sitedisplayname'),
            'SITE_GLOBAL_SITEURL': get_setting('site', 'global', 'siteurl'),
        }
        # Subject templates may contain newlines; collapse to one line.
        subject = ''.join(
            render_to_string(
                template_name=('notification/wp_import/short.txt'),
                context=context).splitlines())
        body = render_to_string(
            template_name=('notification/wp_import/full.html'),
            context=context)
        #send_mail(subject, body, settings.DEFAULT_FROM_EMAIL, [user.email], fail_silently=False)
        email = EmailMessage(subject, body, settings.DEFAULT_FROM_EMAIL,
                             [user.email])
        email.content_subtype = 'html'
        email.send(fail_silently=True)
def parse_metadata(filename):
    # Parse a scanner metadata XML file.
    # Returns (maxvals, dims): per-channel "Max" values and, for each entry
    # of the module-level `labels` list, the matching ScannerSetting
    # "Variant" value (None when the label is absent).
    dims = [
        None,
    ] * len(labels)
    with open(filename, "r") as myfile:
        xml = BeautifulStoneSoup(myfile.read())
    maxvals = []
    # NOTE(review): iterating xml.Channels walks the children of the first
    # <Channels> tag — confirm this matches the file schema.
    for channel in xml.Channels:
        maxvals.append(float(channel["Max"]))
    # Match each ScannerSetting description against the known labels; the
    # last matching setting wins for a repeated description.
    for setting in xml.ScannerSetting:
        for i, l in enumerate(labels):
            if setting["Description"] == l:
                dims[i] = float(setting["Variant"])
    return maxvals, dims
def cmd(self, cmd, silent=False):
    """Send *cmd* to the server and return its <response> element.

    Prints the result line unless *silent* is set and the code is a routine
    success (1000/1300/1500). Exits the process when no result code can be
    read; returns False for result codes 2308 and 2502.
    """
    self.write(cmd)
    raw = self.read()
    soup = BeautifulStoneSoup(raw, 'lxml')
    response = soup.find('response')
    result = soup.find('result')
    try:
        code = int(result.get('code'))
    except AttributeError:
        # No <result> element at all — the session is unusable.
        print("\nERROR: Could not get result code, exiting.")
        exit(1)
    if not silent or code not in (1000, 1300, 1500):
        print("- [%d] %s" % (code, result.msg.text))
    if code in (2308, 2502):
        return False
    return response
def get_info(self, account):
    # Download the student's info page, save a raw copy to disk, then fetch
    # the student photo referenced by the page.
    request = urllib.request.Request(self.info_url)
    response = self.opener.open(request)
    content = response.read().decode(self.character).encode("utf-8")
    # Keep the raw page under new/<account>.html for later inspection.
    file = open('new/' + account + '.html', 'wb')
    file.write(content)
    file.close()
    detail_html = BeautifulStoneSoup(content)
    img_url = detail_html.find(id="Student11_Image1")
    link = img_url.get('src')
    link = link[2:]  # drop the leading two chars ("~/"-style prefix — TODO confirm)
    pto_url = 'http://szjy.swun.edu.cn/Sys/SystemForm' + link
    # Replace the literal CJK path segment with its percent-encoded form
    # (presumably the GBK encoding the server expects — confirm).
    pto_url = pto_url.replace('照片', '%D5%D5%C6%AC')
    # Reuse the logged-in opener so urlretrieve carries the session cookies.
    urllib.request.install_opener(opener=self.opener)
    img_name = 'photos/' + account + '.jpg'
    urllib.request.urlretrieve(pto_url, img_name)
    self.cookie = self.cookie.clear()
def check_updates(self):
    """Fetch the feed, merge new entries into self.FEED and return them.

    The returned list is reversed (oldest new entry first); any failure is
    logged and yields an empty list.
    """
    logger.info("Start checking updates")
    try:
        feed_xml = BeautifulStoneSoup(req.urlopen(self.URL).read())
        if not feed_xml:
            return []
        current = series.series_from_xml(feed_xml)
        fresh = [entry for entry in current if entry not in self.FEED]
        # Keep the rolling window once it has reached 12 entries.
        if len(self.FEED) == 12:
            self.FEED = fresh + self.FEED[:-len(fresh)]
        else:
            self.FEED = fresh + self.FEED
        fresh.reverse()
        return fresh
    except Exception as e:
        logger.error("Check failed : %s" % e)
        return []