Пример #1
0
 def paragraph_tokens(self, para, remove_people=False):
     """Yield the tokens of one stored paragraph, optionally filtered.

     Args:
         para: Index used to look the paragraph up in ``self.paras_``.
         remove_people: When true, any token that is contained in
             ``self.people_tokens_`` is skipped instead of yielded.

     Yields:
         The items produced by ``tokens(...)`` for the selected
         paragraph, in the order that helper produces them.
     """
     for token in tokens(self.paras_[para]):
         # Membership is only consulted when filtering was requested.
         keep = not remove_people or token not in self.people_tokens_
         if keep:
             yield token
Пример #2
0
    def parse_tree(self, raw):
        """Populate this object's fields from a parsed XML document.

        Sets ``title_``, ``date_``, ``byline_``, ``people_``,
        ``people_tokens_`` and ``paras_`` on ``self``.

        Args:
            raw: Root of the parsed document. It must provide ``findall``
                and yield elements exposing ``text`` and ``attrib``
                (assumes an ElementTree-style API -- confirm with caller).

        Raises:
            ValueError: If a date field's content cannot be converted
                with ``int``.
        """
        # Title: text of the first "head/title" element, or "" if absent.
        titles = [x.text for x in raw.findall("head/title")]
        self.title_ = titles[0] if titles else ""

        # Dates can come from <meta name=... content=...> entries ...
        # ``.get`` so that meta elements without a "name" attribute are
        # skipped instead of raising KeyError.
        self.date_ = {DATE_TYPES[x.attrib["name"]]: int(x.attrib["content"])
                      for x in raw.findall("head/meta")
                      if x.attrib.get("name") in DATE_TYPES}
        # ... or from elements whose path is itself a DATE_TYPES key; the
        # latter override the former when both are present.
        for ii in DATE_TYPES:
            for jj in raw.findall(ii):
                self.date_[DATE_TYPES[ii]] = int(jj.text)

        # Bylines lacking a "class" attribute are ignored rather than
        # raising KeyError.
        self.byline_ = [x.text for x in raw.findall("body/body.head/byline")
                        if x.attrib.get("class") == "normalized_byline"]

        # there are other entities (such as orgs and locations), but
        # we're not dealing with them for now
        #
        # Person elements with no text are skipped individually; one
        # empty element no longer discards every other person.
        # ``people_`` is always a set.
        self.people_ = {
            pp.text.upper().split("(")[0].strip()
            for pp in raw.findall("head/docdata/identified-content/person")
            if pp.text is not None and pp.text not in self.byline_
        }

        # Union of the tokens of every identified person name.
        self.people_tokens_ = set()
        for person in self.people_:
            self.people_tokens_.update(tokens(person))

        # Body text lives in "full_text" blocks and/or top-level <txt>
        # elements.
        body = [x for x in raw.findall("body/body.content/block")
                if x.attrib.get("class") == "full_text"]
        body.extend(raw.findall("txt"))

        self.paras_ = []
        for block in body:
            # ``text`` is None when the element starts directly with a
            # child element; treat that as "no inline markup" so the
            # child-<p> branch below is reached instead of a TypeError.
            text = block.text or ""
            if "<p>" in text:
                # Paragraph markup embedded as literal text.
                for chunk in text.split("<p>"):
                    self.paras_.append(chunk.replace("</p>", ""))
            else:
                # Paragraphs are real child elements.
                # NOTE(review): an empty <p/> still contributes None here,
                # as before -- confirm downstream tokenising tolerates it.
                for pp in block.findall("p"):
                    self.paras_.append(pp.text)