def paragraph_tokens(self, para, remove_people=False):
    """Yield the tokens of paragraph `para`, optionally skipping person names."""
    for ii in tokens(self.paras_[para]):
        if remove_people and ii in self.people_tokens_:
            continue
        yield ii
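# tokens() is a module-level helper assumed to be defined elsewhere in this
# file; it is not shown here.  A rough sketch of a compatible tokenizer
# (an assumption, not the actual implementation) would be:
#
#     import re
#
#     def tokens(text):
#         # Split on runs of word characters; the real helper presumably
#         # also normalizes case, since people_tokens_ holds upper-cased names.
#         return re.findall(r"\w+", text or "")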
def parse_tree(self, raw):
    """Populate title, date, byline, people, and paragraph fields from the
    ElementTree element of a single document."""

    # Title: take the first head/title element if present.
    self.title_ = [x.text for x in raw.findall("head/title")]
    self.title_ = self.title_[0] if self.title_ else ""

    # Dates can appear as head/meta attributes or as standalone elements
    # named after the DATE_TYPES keys; the latter override the former.
    self.date_ = dict((DATE_TYPES[x.attrib["name"]], int(x.attrib["content"]))
                      for x in raw.findall("head/meta")
                      if x.attrib["name"] in DATE_TYPES)
    for ii in DATE_TYPES:
        for jj in raw.findall(ii):
            self.date_[DATE_TYPES[ii]] = int(jj.text)

    self.byline_ = [x.text for x in raw.findall("body/body.head/byline")
                    if x.attrib["class"] == "normalized_byline"]

    # There are other entities (such as orgs and locations), but
    # we're not dealing with them for now.
    try:
        self.people_ = set(pp.text.upper().split("(")[0].strip()
                           for pp in raw.findall("head/docdata/"
                                                 "identified-content/person")
                           if pp.text not in self.byline_)
    except AttributeError:
        # A person element with no text raises AttributeError on .upper().
        self.people_ = set()

    self.people_tokens_ = []
    for ii in self.people_:
        self.people_tokens_ += tokens(ii)
    self.people_tokens_ = set(self.people_tokens_)

    # Paragraphs come either from full_text blocks or from raw txt elements.
    body = [x for x in raw.findall("body/body.content/block")
            if x.attrib["class"] == "full_text"]
    for ii in raw.findall("txt"):
        body.append(ii)

    self.paras_ = []
    for ii in body:
        if ii.text and "<p>" in ii.text:
            # Paragraphs embedded as literal <p> markup inside the text.
            for jj in ii.text.split("<p>"):
                self.paras_.append(jj.replace("</p>", ""))
        else:
            # Paragraphs stored as child <p> elements.
            for pp in ii.findall("p"):
                self.paras_.append(pp.text)
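# A minimal (hypothetical) usage sketch, assuming the enclosing class can be
# instantiated as NytDocument and that articles are stored one per XML file;
# neither name is defined in this module:
#
#     from xml.etree import ElementTree
#
#     doc = NytDocument()
#     doc.parse_tree(ElementTree.parse("sample_article.xml").getroot())
#     print(doc.title_, doc.date_)
#     for word in doc.paragraph_tokens(0, remove_people=True):
#         print(word)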