示例#1
0
 def DetectDidnotparticipate(self, gnv, vlabsent):
     adtext = re.sub("</?i>", "", self.tlcall[self.i].paratext)
     mnotparticipate = re.match(
         "(.*?)\s*did not participate in the voting.", adtext)
     if mnotparticipate:
         assert len(vlabsent) == 1
         if vlabsent[0] == mnotparticipate.group(1):
             self.i += 1
         else:
             print "nonparcitipation name wrong", vlabsent[
                 0], mnotparticipate.group(1)
             #if self.undocname != "S-PV-4305":
             raise unexception("mismatch nonparticipationvoting",
                               self.tlcall[self.i].paranum)
     else:
         msubvote = re.match(
             "\[Subsequently.*? (Jamaica) .*? voted? in (favour)", adtext)
         if msubvote:
             nat = msubvote.group(1)
             gnv[nat] = "%s-%s" % (gnv[nat], msubvote.group(2))
             self.i += 1
         elif len(vlabsent) != 0:
             if self.undocname not in [
                     "S-PV-3412", "S-PV-3413", "S-PV-3407", "S-PV-3409"
             ]:  # cases where Rwanda is absent
                 raise unexception("unaccounted nonparticipationvoting",
                                   self.tlcall[self.i].paranum)
示例#2
0
    def DetectSubsequentVoteChange(self, gnv):
        adtext = re.sub("</?i>", "", self.tlcall[self.i].paratext)
        msubseq = re.match("\[Subsequently,? (.*?)\.?\]", adtext)
        self.votechanges = {}
        if not msubseq:
            if re.search("Subsequently", adtext):
                print adtext
                raise unexception("unexpected subsequently",
                                  self.tlcall[self.i].paranum)
            return

        for sadtext in re.split(";\s*", msubseq.group(1)):
            if not sadtext:
                continue
            msadtext = re.match(
                "the delegations? of (.*?) (?:informed|advised) the [Ss]ecretariat that (?:it|they) (?:had )?intended to (vote in favour|vote against|abstain)$",
                sadtext)
            if not msadtext:
                msadtext = re.match(
                    "the delegations? of (.*?)(?:(?:informed|advised) the Secretariat that (?:it|they))? had (not) intended to participate(?: in the voting)?$",
                    sadtext)
            if not msadtext:
                msadtext = re.match(
                    "the delegations? of (.*?) had intended to (vote in favour|vote against|abstain)$",
                    sadtext)
            if not msadtext:
                print "---%s---" % sadtext
                #print re.match("the delegations? of (.*?) (?:informed|advised) the Secretariat that (?:it|they) had", sadtext)
                raise unexception("change vote advice unrecognized",
                                  self.tlcall[self.i].paranum)

            mess, natlist, carryforward = self.DetectNationList(
                msadtext.group(1), "ANDLIST", self.tlcall[self.i].paranum)
            assert natlist
            #print sadtext, msadtext.group(1)
            #print natlist, "(", msadtext.group(2)
            assert not carryforward
            for nat in natlist:
                assert nat not in self.votechanges
                if re.search("favour", msadtext.group(2)):
                    vch = "favour"
                elif re.search("against", msadtext.group(2)):
                    vch = "against"
                elif re.search("abstain", msadtext.group(2)):
                    vch = "abstain"
                elif re.search("not", msadtext.group(2)):
                    vch = "absent"
                else:
                    assert False
                self.votechanges[nat] = vch

        self.votechange = adtext
        self.i += 1
        for nat in self.votechanges:
            gnv[nat] = "%s-%s" % (gnv[nat], self.votechanges[nat])
示例#3
0
    def ExtractDateTime(self, txline, ltext):
        # extract the date out if poss
        mdate = re.match(
            "\w+\s*, (\d+)\s+(\w+)\s+(\d+),\s*(?:at )?(\d+)[\.:]?(\d*)(?:\s+([ap])\.?\s*m\.?| noon\.?)?(?: \(closed\))?$",
            ltext)
        if not mdate:  #Tuesday, 3 December 2002, 10 a.m.
            if re.search("Friday", ltext) and IsNotQuiet():
                print ltext, re.match(
                    "\w+\s*, (\d+)\s+(\w+)\s+(\d+),\s*(?:at )?(\d+)[\.:]?(\d*)(?:\s+([ap])\.?m\.?| noon\.?)?(?: \(closed\))?",
                    ltext)
            return

        #print txlines[ih].ltext
        iday = int(mdate.group(1))
        if mdate.group(2) not in months:
            if IsNotQuiet():
                print mdate.group(2), months
            raise unexception(
                "unrecognized month",
                paranumC(txline.undocname, None, 0, -1,
                         txline.textcountnumber))
        imonth = months.index(mdate.group(2))
        syear = mdate.group(3)
        if not re.match("(?:20\d\d|19\d\d)$", syear):
            raise unexception(
                "bad year",
                paranumC(txline.undocname, None, 0, -1,
                         txline.textcountnumber))
        ihour = int(mdate.group(4))
        imin = mdate.group(5) and int(mdate.group(5)) or 0
        if mdate.group(6) and mdate.group(6) == "a" and ihour == 12:
            ihour = 0
        elif mdate.group(6) and mdate.group(6) == "p" and ihour != 12:
            ihour += 12
        if self.date:
            raise unexception(
                "date redefined",
                paranumC(txline.undocname, None, 0, -1,
                         txline.textcountnumber))
        if not (0 <= ihour <= 23) or not (0 <= imin <= 59):
            if IsNotQuiet():
                print ltext
            raise unexception(
                "bad time",
                paranumC(txline.undocname, None, 0, -1,
                         txline.textcountnumber))
        self.date = "%s-%02d-%02d %02d:%02d" % (syear, imonth + 1, iday, ihour,
                                                imin)
示例#4
0
    def DetectNationList(self, ptext, fromlast, paranum):
        bforce = (not not fromlast)
        if fromlast and fromlast not in ["FIRST", "ANDLIST"]:
            print "carryingforward $%s$" % fromlast
            ptext = "%s %s" % (fromlast, ptext)
        if ptext == "None":
            return "presentcomplete", [], None
        ptext = re.sub("</?i>", "", ptext)
        votelist = [
            c.strip() for c in re.split("[,\.]", ptext)
            if not re.match("\s*$", c)
        ]

        if fromlast == "ANDLIST":
            #print "vvvv", votelist
            assert votelist
            if not FixNationName(votelist[-1], self.sdate):
                mand = re.search("(.*?) and (.*)$", votelist[-1])
                if mand:
                    votelist[-1] = mand.group(1)
                    votelist.append(mand.group(2))

        if re.match("<i>", ptext):
            if bforce:
                print fromlast, bforce, ptext
                assert False
            return "nothingmore", -1, -1

        if votelist and not FixNationName(votelist[-1], self.sdate) and (
                ptext[-1] != ",") and fromlast != "ANDLIST":
            carryforward = votelist[-1]
            votelist = votelist[:-1]
        else:
            carryforward = None

        res = []
        fres = []
        for lnation in votelist:
            nation = FixNationName(lnation, self.sdate)
            if not nation and fromlast == "ANDLIST" and re.match(
                    "[Tt]he ", lnation):
                nation = FixNationName(lnation[4:], self.sdate)
            if nation:
                if nation != "INVALID":
                    res.append(nation)
            else:
                fres.append(lnation)
        if bforce and fres:
            print votelist
            print "****", fres
            print "cccccc", carryforward
            raise unexception("votelist problem", self.tlcall[self.i].paranum)
        if res and not fres:
            return "present", res, carryforward
        if bforce:
            assert not fres
            return "presentblank", res, ""  # the "In favour" is followed by a new page
        #print fres
        return "nothingmore", -1, -1
示例#5
0
    def ExtractDotLineChair(self, txlines, ih):
        assert self.pageno == 1
        #<text top="334" left="185" width="584" height="17" font="2">Mr.  Kavan  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . (Czech Republic)</text>
        while True:
            #print "------" + txlines[ih].ltext
            mchair = re.search("([^>:]*?)\s*\. \. \. \. \.", txlines[ih].ltext)
            if mchair:
                break

            # fix missing year date
            #if self.undocname == "A-55-PV.44" and txlines[ih].ltext == "Monday, 30 October, 10 a.m.":
            #    txlines[ih].ltext = "Monday, 30 October 2000, 10 a.m."
            self.ExtractDateTime(txlines[ih], txlines[ih].ltext)

            ih += 1
            if ih == len(txlines):
                return -1

        if not self.date:
            if IsNotQuiet():
                for i in range(ih):
                    print "--%s--" % txlines[i].ltext
            raise unexception(
                "dotlinechair date problem",
                paranumC(txlines[ih].undocname, None, 0, -1,
                         txlines[ih].textcountnumber))
            assert False

        # when country name for the president . . . . is not on same line
        mcountry = re.search("\((.*?)\)$", txlines[ih].ltext)
        if not mcountry:
            ih += 1
            #print txlines[ih].ltext
            mcountry = re.match("\((.*?)\)$", txlines[ih].ltext)
            if not mcountry:
                if IsNotQuiet():
                    print txlines[ih].ltext
                raise unexception(
                    "unable to extract country from  ...-line",
                    paranumC(txlines[ih].undocname, None, 0, -1,
                             txlines[ih].textcountnumber))
        ih += 1
        chairname = re.sub("\s\s+", " ", mchair.group(1)).strip()
        self.chairs.append(
            (chairname, FixNationName(mcountry.group(1), self.date)))
        return ih
示例#6
0
 def DetectAdoption(self):
     adtext = re.sub("</?i>", "", self.tlcall[self.i].paratext)
     madtext = re.search(
         "(adopted|carried|retained.*?|rejected)(?:, as amended,|, as a whole,)?\s+by(?: votes)?\s+(\d+)(?:\s+votes)? to\s+(\d+|none)(?:,? with (\d+) abstentions?)?",
         adtext)
     if not madtext:
         madtext = re.match(
             "(By)\s+(\d+)(?:\s+votes)? to\s+(\d+|none)(?:,? with (\d+) abstentions?)?",
             adtext)
     if not madtext:
         print "--%s-- %d" % (adtext, self.i)
         raise unexception("by votes problem", self.tlcall[self.i].paranum)
     ifavour = int(madtext.group(2))
     iagainst = (madtext.group(3) != "none" and int(madtext.group(3)) or 0)
     #if madtext.group(1) == "rejected":
     #    i = ifavour;  ifavour = iagainst;  iagainst = i
     iabstain = (madtext.group(4) and int(madtext.group(4)) or 0)
     if madtext.group(1) == "rejected":
         il = (iagainst, ifavour, iabstain)
     else:
         il = (ifavour, iagainst, iabstain)
     ivl = (len(self.vlfavour), len(self.vlagainst), len(self.vlabstain))
     if il != ivl:
         if IsNotQuiet():
             print "wrong-count", self.undocname, il, ivl
         # wrong values are found on A-57-PV.73 s(favour=154, 152)
         if self.undocname not in [
                 "A-56-PV.82",
                 "A-57-PV.73",
                 "A-58-PV.54",
                 "A-52-PV.69",
                 "A-50-PV.90",
                 "A-49-PV.83",
         ]:
             raise unexception("wrong votecount",
                               self.tlcall[self.i].paranum)
     self.motiontext = MarkupLinks(adtext, self.undocname, self.paranum)
     self.i += 1
示例#7
0
def DetectAgendaForm(ptext, genasssess, prevagendanum, paranum):
    """Derive the agenda identifier(s) announced by a heading paragraph.

    ptext is the heading text, genasssess the session label appended to
    every identifier, prevagendanum the identifier returned for the
    previous heading, and paranum the location passed to unexception.

    Returns a string, tried in this order:
      * "Agenda item(s) N[, M ...]"            -> "N-<sess>,M-<sess>"
      * "Item(s) N [and M] of the provisional agenda"
                                               -> "Np-<sess>[,Mp-<sess>]"
      * "Request for the reopening ... agenda item N" -> "N-<sess>"
      * a "(x)" sub-item heading               -> prevagendanum, or "" when
        prevagendanum is not of the form "N-<sess>"
      * the first AgendaTypeMap entry whose pattern is found in ptext
                                               -> "<type>-<sess>"
      * otherwise ""

    Raises unexception when an "Agenda item" heading contains anything
    other than numbers once the known decorations are removed.
    """
    # the (?i) flag is global; it is written first because Python 3.11+
    # rejects a global flag anywhere else in the pattern
    if re.match(r"(?i)Agenda(?: items?)? \d+", ptext):
        # strip "(continued)"/"(resumed)" in any italic arrangement
        acptext = re.sub(
            r"(?:<i>\s*\(|\(\s*<i>|\()\s*(?:continued|resumed)\s*(?:\)\s*</i>|</i>\s*\)|\))|<i>\s*</i>",
            " ", ptext).strip()
        acptext = re.sub(r"\(\w\)|;.*$", "", acptext)
        acptext = re.sub(r"(?i)agenda items?", " ", acptext)
        acptext = re.sub("and", ", ", acptext)
        if not re.match(r"[\d\s,]+$", acptext):
            print(ptext)
            raise unexception("malformed boldline agenda", paranum)
        res = ",".join(
            ["%s-%s" % (aa, genasssess) for aa in re.findall(r"\d+", acptext)])
        assert res
        return res

    mprovag = re.match(
        r"Items? (\d+)(?: and (\d+)?)?(?: \(\w\))? of the provisional agenda",
        ptext)
    if mprovag:
        res = "%sp-%s" % (mprovag.group(1), genasssess)
        if mprovag.group(2):
            # second item number (previously the first one was repeated)
            res = "%s,%sp-%s" % (res, mprovag.group(2), genasssess)
        return res

    mreqreopen = re.match(r"Request for the reopening.*?agenda item (\d+)",
                          ptext)
    if mreqreopen:
        return "%s-%s" % (mreqreopen.group(1), genasssess)

    # a "(a)"-style heading continues the previous agenda item
    if re.match(r"\(\w\)", ptext):
        if not re.match(r"\d+-\d+", prevagendanum):
            print("can't copy from prevagendanum %s" % (prevagendanum,))
            return ""
        assert prevagendanum.split("-")[1] == genasssess
        return prevagendanum

    for agt, reagt in AgendaTypeMap:
        if re.search(reagt, ptext):
            if agt == "condolence":
                print("NNNN %s" % (ptext,))
            return "%s-%s" % (agt, genasssess)

    # nothing recognised: report the heading and give up
    print("\n\n****   %s" % (ptext,))
    print(genasssess)
    return ""
示例#8
0
def CleanupTags(ptext, typ, paranum):
    assert typ in [
        "council-agenda", "italicline", "italicline-tookchair",
        "italicline-spokein", "boldline", "spoken"
    ]
    if typ == "boldline":
        ptext = re.sub("</?b>", "", ptext).strip()
    if re.search("<b>", ptext):
        ptext = re.sub("<b>([.,]\s*)</b>", "\\1", ptext)

    # slipt in a cleaning substitution here (can't find a better place for now)
    #ptext = re.sub("[u'`\u017d']", "'", ptext)  # this one doesn't work
    ptext = re.sub(u'[\xad]', "-",
                   ptext)  # some very invisibley different symbol

    # could have a special paragraph type for this
    mspokein = re.match(
        "\((spoke in \w+(.*?delegation|President's Office)?)\)$", ptext)
    if mspokein:
        stext = re.sub("<[/ib]*>", "", mspokein.group(1)).strip()
        return "<i>%s</i>" % stext

    if re.search("<[^/i]+>", ptext):
        print ptext
        raise unexception("tag other than italics in text", paranum)
    if re.match("<i>.*?</i>[\s\.\-]?$", ptext):
        print ptext
        raise unexception("total italics in text", paranum)
    if re.search("</?i>", "".join(re.split("<i>(.*?)</i>", ptext))):
        print ptext
        raise unexception("unmatched italics in spoken text", paranum)
    if re.search("\s\S\s\S\s\S\s", ptext):
        print ptext
        raise unexception("probable gaps in text", paranum)

    return ptext
示例#9
0
    def DetectVote(self, votere):
        tlc = self.tlcall[self.i]
        votem = re.match(votere, tlc.paratext)
        if not votem:
            # missing abstain column case
            bAftervote = re.match(
                "<i>\s*(?:The )?[Dd]raft|<b>\s*The President|<i>\s*Operative paragraph|<i>.*?did not participate",
                tlc.paratext)
            if bAftervote and re.search("Abstain", votere):
                # and self.undocname in ["A-53-PV.81", "A-55-PV.103", "A-55-PV.83", "A-55-PV.86", "A-56-PV.105", "A-56-PV.68", "A-56-PV.82", "A-56-PV.86", "A-57-PV.57", "A-57-PV.66", "A-57-PV.77", "A-58-PV.55", "A-58-PV.72"]:
                return []
            if bAftervote and re.search("Against",
                                        votere) and self.bSecurityCouncil:
                return []
            if re.search("Against|Abstaining", votere) and re.search(
                    "Subsequently", tlc.paratext) and self.bSecurityCouncil:
                return []
            if self.undocname in ["A-55-PV.44"] and re.search(
                    "Against", votere) and re.match("<i>Abstaining",
                                                    tlc.paratext):
                return []
            if re.search("Against", votere) and re.match(
                    "<i>Abstaining:?</i>", tlc.paratext):
                return []
            print "failed with:", votere, tlc.paratext
            raise unexception("votelist detectvote match", tlc.paranum)

        #print tlc.paratext
        mess, natlist, carryforward = self.DetectNationList(
            tlc.paratext[votem.end(0):].strip(), "FIRST", tlc.paranum)
        assert mess != "nothingmore"
        self.i += 1

        # deal with nation names merging across pages.
        while True:
            mess, cnatlist, carryforward = self.DetectNationList(
                self.tlcall[self.i].paratext, carryforward,
                self.tlcall[self.i].paranum)
            if mess == "nothingmore":
                #print self.tlcall[self.i].paratext
                break
            natlist.extend(cnatlist)
            self.i += 1
        return natlist
示例#10
0
    def __init__(self, txline, lundocname, lpageno, textcountnumber):
        mxline = re.match(
            '<text top="(\d+)" left="(\d+)" width="-?(\d+)" height="(\d+)" font="(\d+)">(.*?)</text>',
            txline)
        if not mxline:
            print txline, "tttttt"
        self.top = int(mxline.group(1))
        self.left = int(mxline.group(2))
        self.width = int(mxline.group(3))
        self.height = int(mxline.group(4))
        self.font = int(mxline.group(5))
        self.pageno = lpageno
        self.undocname = lundocname
        self.textcountnumber = textcountnumber
        self.ltext = mxline.group(6).strip()

        self.ltext = re.sub("<i>\s*</i>|<b>\s*</b>", " ", self.ltext)
        if re.match("<[ib]>\s*</[ib]>|\s*$", self.ltext):
            self.ltext = ""

        # will be removed
        if not self.ltext:
            return

        self.bfootertype = (self.left < 459
                            and self.left + self.width > 459) or re.match(
                                footertext, self.ltext)
        #if self.bfootertype:
        #    print self.ltext

        # move on any short bits that are like 13^(th)
        if self.height == 11 and not self.bfootertype and self.width <= 10:
            #print self.left, self.width, "'%s'" % self.ltext
            assert self.width <= 10
            if self.ltext not in ["th", "rd", "st", "nd"]:
                if IsNotQuiet():
                    print self.ltext
                raise unexception(
                    "unrecognized shortbit",
                    paranumC(self.undocname, None, 0, -1,
                             self.textcountnumber))
            self.top += 2  # push the step down from 16 to 18
示例#11
0
def AppendToCluster(txlcol, txl):

    # frig the indentation on the most common mistakes
    if re.match(
            "<i>The meeting (?:was called|was suspended|rose at|was resumed)",
            txl.ltext) and (txl.indent == 0):
        txl.indent = 31

    if not txlcol:
        txlcol.append(TextLineCluster(txl))
        return
    txl.vgap = txl.top - txlcol[-1].txls[-1].top

    #print txlcol[-1].txls[-1].ltext
    #print txl.vgap, txl.width, txl.height, txl.top,  txl.ltext  # zzzz

    # frig vgaps in some cases where the spacing was wider than normal
    if txl.undocname in ["A-50-PV.84", "A-50-PV.88"]:
        if txl.vgap == 21 or txl.vgap == 22:
            txl.vgap = 18
        if txl.vgap == 42:
            txl.vgap = 43
    if txl.undocname == "S-PV-5584":
        if txl.vgap == 20:
            txl.vgap = 19

    if not txl.vgap in familiarvgaps:
        if IsNotQuiet():
            print "\n\n   vgap=", txl.vgap, "\n\nwidth/height/top", txl.width, txl.height, txl.top, txl.ltext  # zzzz
            print " familiar vgaps:", familiarvgaps
        raise unexception(
            "vgap not familiar",
            paranumC(txl.undocname, None, 0, -1, txl.textcountnumber))
    if txl.vgap in (0, 17, 18, 19) or txl.vgap == 0:
        txlcol[-1].AddLine(txl)
    else:
        #print txl.vgap, "vvvv", txl.ltext
        txlcol.append(TextLineCluster(txl))
示例#12
0
    def __init__(self, tlcall, i, lundocname, lsdate, seccouncilmembers):
        self.tlcall = tlcall
        self.i = i
        self.sdate = lsdate
        self.undocname = lundocname
        self.bSecurityCouncil = re.match("S-PV.\d+", self.undocname)
        self.bGeneralAssembly = re.match("A-\d+-PV", self.undocname)
        assert self.bGeneralAssembly or self.bSecurityCouncil
        if not self.bSecurityCouncil:
            seccouncilmembers = None

        self.pageno, self.paranum = tlcall[i].txls[0].pageno, tlcall[i].paranum

        vtext = re.sub("</?i>", "", tlcall[self.i].paratext).strip()
        if self.bGeneralAssembly and re.match(
                "A recorded vote has been requested(?: for this item| on (?:the|this) motion|\. We shall now begin the voting process)?\.?$",
                vtext):
            self.i += 1
            vtext = re.sub("</?i>", "", tlcall[self.i].paratext).strip()
        if self.bGeneralAssembly and re.match(
                "A recorded vote was taken\s*\.?$", vtext):
            self.i += 1
        if self.bSecurityCouncil and re.match(
                "A vote was taken(?: by (?:a )?show of hands)?.$", vtext):
            self.i += 1

        if not (self.i != i or self.undocname
                in ["A-55-PV.86", "A-50-PV.90", "A-49-PV.90"]):
            print "--%s--" % tlcall[self.i - 1].paratext
            if not re.match("<i>", tlcall[self.i - 1].paratext):
                print "  --[should this line be italic?]"
            print tlcall[self.i].paratext
            raise unexception("requested vote not followed through",
                              tlcall[self.i].paranum)

        self.vlfavour = self.DetectVote("<i>In favour:?\s*</i>:?")
        self.vlagainst = self.DetectVote("(?:<i>)?Against:?\s*(?:</i>)?:?")
        self.vlabstain = self.DetectVote("(?:<i>)?Abstaining:?(?:</i>)?:?")
        gnv, self.vlabsent = GenerateNationsVoteList(self.vlfavour,
                                                     self.vlagainst,
                                                     self.vlabstain,
                                                     self.sdate, self.paranum,
                                                     seccouncilmembers)
        self.votecount = "favour=%d against=%d abstain=%d absent=%d" % (len(
            self.vlfavour), len(self.vlagainst), len(
                self.vlabstain), len(self.vlabsent))
        if IsNotQuiet():
            print "  ", self.votecount
        if self.bGeneralAssembly:
            self.DetectAdoption()
            self.DetectSubsequentVoteChange(gnv)
        if self.bSecurityCouncil:
            self.motiontext = ""
            self.DetectDidnotparticipate(gnv, self.vlabsent)

        #res = [ '\t\t<div style="border:1px solid black; margin-left:2em"><b>VOTE ', votecount, "</b><br>\n", "\t\t<i>", self.motiontext, "</i>\n" ]
        #res.append('\t\t<div style="font-size:6">')
        lvotelist = []
        for nation, vote in sorted(gnv.items()):
            lvotelist.append('<span class="%s">%s</span>' % (vote, nation))
        self.votelist = ", ".join(lvotelist)
        #res.append("</div></div>\n")
        #self.parafout = "".join(res)
        self.typ = "vote"
示例#13
0
    def ExtractSeccounFrontPage(self, txlines):
        self.date = None
        self.chairs = []
        self.seccouncilmembers = []
        self.agenda = []

        lasttop = -1
        jtxlines = []
        ih = 0
        while ih < len(txlines):
            if txlines[ih].top == lasttop:
                jtxlines[-1] = "%s %s" % (jtxlines[-1], txlines[ih].ltext)
            else:
                jtxlines.append(txlines[ih].ltext)
                lasttop = txlines[ih].top
            ih += 1

        del txlines  # just deletes the reference to this object
        ih = 0
        while ih < len(jtxlines):
            self.ExtractDateTime(None, jtxlines[ih])
            mpresseat = re.match(
                "<i>(President|Chairman|later)(?:</i>:|:\s*</i>)\s*((?:Mr.|Mrs.|Ms.|Sir\.?|Miss|Sheikh|Baroness|Lord|Nana) .*?)\s+\.(?: \.)*\s*(\(.*)?$",
                jtxlines[ih])
            #print jtxlines[ih], mpresseat
            if mpresseat:
                if not self.date:
                    if IsNotQuiet():
                        for i in range(ih):
                            print jtxlines[i]
                    raise unexception(
                        "missingg date",
                        paranumC(self.undocname, None, 0, -1,
                                 self.textcountnumber))
                if mpresseat.group(1) in ["President", "Chairman"]:
                    assert len(self.chairs) == 0  # first one
                else:
                    assert len(self.chairs) == 1  # later president
                ih += 1
                if mpresseat.group(3):
                    scountry = mpresseat.group(3)
                else:
                    scountry = ""
                if re.search("\(", scountry) and not re.search("\)", scountry):
                    scountry = "%s %s" % (scountry, jtxlines[ih])
                    ih += 1
                mcountry = re.match("\((.*?)\)$", scountry)
                lfscountry = re.sub("\s+", " ", mcountry.group(1))
                fscountry = FixNationName(lfscountry, self.date)
                if not fscountry:
                    if IsNotQuiet():
                        print "--%s--" % mcountry.group(1)
                    raise unexception(
                        "unrecognized nationA",
                        paranumC(self.undocname, None, 0, -1,
                                 self.textcountnumber))
                chairname = re.sub("\s\s+", " ", mpresseat.group(2)).strip()
                self.chairs.append((chairname, fscountry, "president"))

                if fscountry in self.seccouncilmembers:
                    assert len(self.seccouncilmembers) == 1
                    assert fscountry == "New Zealand"
                    assert self.undocname == "S-PV-3370"
                    assert len(self.chairs) == 2
                    del self.chairs[0]
                    del self.seccouncilmembers[0]

                self.seccouncilmembers.append(fscountry)
                continue

            mcountryseat = re.match(
                "(<i>Members(?:</i>:|:\s*</i>))?\s*([\w\-\s]*?)\s*\.(?: \.)*\s*((?:Mr.|Ms.|Mrs.|Miss|Dr.|Sir\.?|Sheikh|Baroness|Lord|Nana) [^<>]*|absent)$",
                jtxlines[ih])
            if mcountryseat:
                if mcountryseat.group(1):
                    if len(self.chairs) not in [
                            1, 2
                    ]:  # in case of second president
                        if IsNotQuiet():
                            print self.chairs, "chchchch"
                        raise unexception(
                            "chairs not thereB",
                            paranumC(self.undocname, None, 0, -1,
                                     self.textcountnumber))
                else:
                    if len(self.chairs) == 0:
                        if not self.date:  # prob a closed meeting
                            break
                        if IsNotQuiet():
                            print ih, jtxlines[ih]
                        raise unexception(
                            "seat without chair",
                            paranumC(self.undocname, None, 0, -1,
                                     self.textcountnumber))
                lfscountry = re.sub("\s+", " ", mcountryseat.group(2))
                fscountry = FixNationName(lfscountry, self.date)
                if not fscountry:
                    if IsNotQuiet():
                        print "--%s--" % mcountryseat.group(2)
                    raise unexception(
                        "unrecognized nationB",
                        paranumC(self.undocname, None, 0, -1,
                                 self.textcountnumber))
                chairname = re.sub("\s\s+", " ", mcountryseat.group(3)).strip()
                self.chairs.append((chairname, fscountry, "member"))
                if fscountry not in self.seccouncilmembers:
                    self.seccouncilmembers.append(fscountry)
                else:
                    if IsNotQuiet():
                        print "Repeat-country on council", fscountry
            else:
                if re.search(" \. \. \. \. \. \. ", jtxlines[ih]):
                    if IsNotQuiet():
                        print "--%s--" % jtxlines[ih]
                    raise unexception(
                        "missing country",
                        paranumC(self.undocname, None, 0, -1,
                                 self.textcountnumber))
            if re.match("<b>Agenda\s*</b>$", jtxlines[ih]):
                ih += 1
                break
            if re.search("Agenda", jtxlines[ih]):
                print ih, jtxlines
                raise unexception(
                    "unextracted Agenda (should be <b>?)",
                    paranumC(self.undocname, None, 0, -1,
                             self.textcountnumber))
            ih += 1

        # could be a closed meeting
        if not self.date:
            alltext = " ".join(jtxlines)
            if re.search(
                    "OFFICIAL COMMUNIQU..*?Held in private (?:in the Security Council Chamber )?at Headquarters(?i)",
                    alltext):
                return False
            return True

        while ih < len(jtxlines):
            if re.match("\d\d-\d\d", jtxlines[ih]):
                break
            if re.match("\d\d.?\d\d\d\d\d \(E\)", jtxlines[ih]):
                break
            if re.match(
                    "This record contains the text of speeches delivered in English",
                    jtxlines[ih]):
                break
            #print "agagag", jtxlines[ih]
            assert not re.search("text of speeches|verbatim(?i)", jtxlines[ih])
            self.agenda.append(jtxlines[ih].strip())
            ih += 1

        #print "ccccc", self.chairs
        lparanum = paranumC(self.undocname, None, 0, -1, self.textcountnumber)
        if len(self.chairs) not in (15,
                                    17) or len(self.seccouncilmembers) != 15:
            if self.undocname == "S-PV-3446":
                return False
            if IsNotQuiet():
                print len(self.seccouncilmembers), len(
                    self.chairs
                ), "wrong number of members or chairs\n", self.chairs
                print self.seccouncilmembers
            raise unexception("wrongnumber on council", lparanum)

        self.agenda = " ".join(self.agenda)
        self.agenda = re.sub("</?b>", " ", self.agenda)
        self.agenda = re.sub("\s\s+", " ", self.agenda)
        self.agenda = MarkupLinks(
            CleanupTags(self.agenda, "council-agenda", lparanum),
            self.undocname, lparanum)
        return True
示例#14
0
    def __init__(self, xpage, lundocname, lpageno, textcountnumber):
        """Split one page of a verbatim record into header, footer and columns.

        Every `<text ...>...</text>` element found in xpage is wrapped in a
        TextLine, the non-empty lines are sorted with TextLineTopKey, header
        and footer lines are cut off, and each remaining body line is handed
        to AppendToCluster for self.txlcol1 (left < 459) or self.txlcol2.

        Parameters:
            xpage: string holding the `<text>` elements of this page.
            lundocname: document name; must match "S-PV.<digits>" (Security
                Council) or "A-<digits>-PV" (General Assembly).
            lpageno: number of this page; page 1 gets front-page handling.
            textcountnumber: starting value of the running text-line counter;
                self.textcountnumber grows by one per `<text>` element.

        Early exits (txlheader/txlfooter/txlcol1/txlcol2 are NOT set):
            * Security Council page 1: the lines go to
              ExtractSeccounFrontPage; when that returns a false value
              self.bSecurityCouncil is overwritten with "ClosedSession".
            * Security Council page 2 of a document in twopageagendas: the
              lines with 132 <= top < 1000 are joined into self.agenda.

        Raises:
            unexception: bad page header, unparsable or mismatching page
                number, text reaching too far right, or negative indentation.
            AssertionError: document name or General Assembly page header
                does not have the expected form.
        """
        self.pageno = lpageno
        self.undocname = lundocname
        self.textcountnumber = textcountnumber
        # the "." in the pattern matches any character, so "S-PV-4143" matches
        self.bSecurityCouncil = re.match("S-PV.(\d+)", self.undocname)
        # meeting number as an int, or 0 when this is not a Security Council document
        self.nSecurityCouncilSession = self.bSecurityCouncil and int(
            self.bSecurityCouncil.group(1)) or 0
        self.bGeneralAssembly = re.match("A-\d+-PV", self.undocname)
        assert self.bSecurityCouncil or self.bGeneralAssembly

        # for right column, if not left justified, this adds a bit more to the right
        if self.bGeneralAssembly and int(
                re.match("A-(\d+)", lundocname).group(1)) <= 52:
            rightcolstartindentincrement = 1
        else:
            rightcolstartindentincrement = 0

        # set the column starts from some of the special cases we get
        leftcolstart = 90
        if self.bGeneralAssembly and int(
                re.match("A-(\d+)", lundocname).group(1)) <= 54:
            rightcolstart = 481
        else:
            rightcolstart = 468

        # per-document overrides of the right column start; the first
        # matching branch of this chain wins
        if lundocname in [
                "A-54-PV.100", "A-54-PV.96", "A-54-PV.98", "A-54-PV.99",
                "S-PV-4143", "S-PV-4143-Resu.1"
        ]:
            rightcolstart = 468
        elif lundocname in ["A-54-PV.97"]:
            rightcolstart = 486
        elif re.match("S-PV-335[0-8]", lundocname):
            rightcolstart = 468
        elif re.match("S-PV-334", lundocname):
            rightcolstart = 468
        elif self.nSecurityCouncilSession >= 4144:
            rightcolstart = 468

        # (older, pattern-based form of the ">= 4144" test above, kept for reference)
        #re.match("S-PV-414[4-9]", lundocname):
        #    rightcolstart = 468
        #elif re.match("S-PV-41[5-9]", lundocname):
        #    rightcolstart = 468
        #elif re.match("S-PV-4[2-9]", lundocname):
        #    rightcolstart = 468
        #elif re.match("S-PV-5", lundocname):
        #    rightcolstart = 468

        # every other Security Council document
        elif self.bSecurityCouncil:
            rightcolstart = 481
            rightcolstartindentincrement = 1

        # generate the list of lines, sorted by vertical position
        ftxlines = re.findall("<text.*?</text>", xpage)

        txlines = []
        for txline in ftxlines:
            txl = TextLine(txline, lundocname, lpageno, self.textcountnumber)
            # the counter advances for every element, including those dropped below
            self.textcountnumber += 1
            if txl.ltext:
                # a line at the same height as a preceding footer line is a footer line too
                if txlines and txlines[-1].bfootertype and txlines[
                        -1].top == txl.top:
                    txl.bfootertype = True
                txlines.append(txl)
        txlines.sort(key=TextLineTopKey)

        # the half divider is at 459

        # try to separate out the header and footers
        # (each surviving branch must leave ih = first body index and
        # ie = one past the last body index)
        if self.pageno == 1 and self.bGeneralAssembly:
            ih = self.ExtractDotLineChairHead(txlines)
            #for Dtxl in txlines[-10:]:
            #    print Dtxl.top, Dtxl.left, Dtxl.ltext

            # step back over the trailing lines flagged as footer
            # NOTE(review): no lower bound on ie; if every line were flagged
            # the index would run negative and wrap around -- confirm this
            # cannot happen
            ie = len(txlines) - 1
            while txlines[ie].bfootertype:
                #print "FOOTER:", txlines[ie].ltext
                ie -= 1
            #print "**NON-FOOTER:", txlines[ie].ltext
            ie += 1

            # the whole first page gets parsed separately
            assert not self.bSecurityCouncil

        elif self.bSecurityCouncil and self.pageno == 1:
            # a false return value marks the document as a closed session
            if not self.ExtractSeccounFrontPage(txlines):
                self.bSecurityCouncil = "ClosedSession"
            return

        # special case where the agenda spills to a second page (don't forget the outer application of this if)
        elif self.bSecurityCouncil and lundocname in twopageagendas and self.pageno == 2:
            ih = 0
            self.agenda = []
            while ih < len(txlines):
                if 132 <= txlines[ih].top < 1000:
                    self.agenda.append(txlines[ih].ltext)
                ih += 1
            # collapse the collected lines into one marked-up string
            self.agenda = " ".join(self.agenda)
            self.agenda = re.sub("</?b>", " ", self.agenda)
            self.agenda = re.sub("\s\s+", " ", self.agenda)
            lparanum = paranumC(self.undocname, None, 0, -1,
                                self.textcountnumber)
            self.agenda = MarkupLinks(
                CleanupTags(self.agenda, "council-agenda", lparanum),
                self.undocname, lparanum)
            return

        elif self.bGeneralAssembly:
            # three accepted header layouts: a bold document symbol alone, a
            # line starting with a digit followed by the bold symbol, or
            # four lines (body, meeting, session, date)
            if re.match("<b>\w[/.]\d+/PV.\d+\s*</b>", txlines[0].ltext):
                ih = 1
            elif re.match("\d", txlines[0].ltext) and re.match(
                    "<b>\w[/.]\d+/PV.\d+\s*</b>", txlines[1].ltext):
                ih = 2
            else:
                #print txlines[0].ltext
                assert re.match("General Assembly",
                                txlines[0].ltext), txlines[0].ltext
                assert re.match("\d+(?:th|st|nd|rd) (?:plenary )?meeting",
                                txlines[1].ltext)
                assert re.match("\S+ [Ss]ession", txlines[2].ltext)
                assert re.match("\d+ \w+ \d\d\d\d", txlines[3].ltext) or (
                    lundocname in ["A-50-PV.38", "A-50-PV.40"])
                ih = 4
            # footer: a "dd-ddddd" line may come after and/or before the
            # page-number line
            ie = len(txlines) - 1
            if re.match("\d\d\-\d\d\d\d\d", txlines[ie].ltext):
                ie -= 1
            pagenumtext = re.sub("<..?>", "", txlines[ie].ltext).strip()
            if re.match("\d\d\-\d\d\d\d\d", txlines[ie - 1].ltext):
                ie -= 1
            if not re.match("\d+$", pagenumtext):
                if IsNotQuiet():
                    print "jjjj", pagenumtext, txlines[ie].ltext
                raise unexception(
                    "pagenum error not a number",
                    paranumC(self.undocname, None, 0, -1,
                             txlines[ie].textcountnumber))
            if int(pagenumtext) != self.pageno:
                if IsNotQuiet():
                    print pagenumtext, self.pageno
                raise unexception(
                    "pagenum serror of speaker-intro",
                    paranumC(self.undocname, None, 0, -1,
                             txlines[ie].textcountnumber))

        elif self.bSecurityCouncil:
            #if len(txlines) < 4:
            #    raise unexception("intro too short", paranumC(self.undocname, None, 0, -1, txlines[0].textcountnumber))

            # bl0..bl3: the four-line header (body, meeting, year, date);
            # each is False outright unless the page has more than 4 lines
            bl0 = len(txlines) > 4 and re.match("Security Council",
                                                txlines[0].ltext)
            bl1 = len(txlines) > 4 and re.match(
                "\d+(?:th|st|nd|rd)? (?:\(Resumption(?: \d)?\) )?(?:meeting)?",
                txlines[1].ltext)
            bl2 = len(txlines) > 4 and re.match("(\w+-\w+|\w+) [Yy]ear",
                                                txlines[2].ltext)
            bl3 = len(txlines) > 4 and re.match("\d+ \w+ \d\d\d\d",
                                                txlines[3].ltext)

            # bl4: the single-line header, a bold "S/PV.<n>" symbol
            bl4 = re.match(
                "<b>S/PV.\d+\s*(?:\(Resumption [\d|I]\)|\(Part [I]+\))?\s*</b>",
                txlines[0].ltext)
            # NOTE(review): this compares strings, so the ordering is
            # lexicographic rather than numeric (e.g. "415" >= "4143" is
            # True); self.nSecurityCouncilSession holds the numeric value
            bl4r = (self.undocname[5:] >= "4143")

            if bl4 and bl4r:
                ih = 1
            elif bl0 and bl1 and bl2 and bl3:
                ih = 4
            else:
                # NOTE(review): these diagnostics index txlines[1..3] with no
                # length check, so a very short page raises IndexError here
                # before the unexception below is reached
                if IsNotQuiet():
                    print "\nFirst four lines on page:", self.pageno, bl4, bl4r
                    print bl0, txlines[0].ltext
                    print bl1, txlines[1].ltext
                    print bl2, txlines[2].ltext
                    print bl3, txlines[3].ltext
                    print bl4, bl4r
                raise unexception(
                    "bad page header",
                    paranumC(self.undocname, None, 0, -1,
                             txlines[0].textcountnumber))

            # footer: optional "dd-ddddd" line, then the (optionally bold) page number
            ie = len(txlines) - 1
            if re.match("\d\d\-\d\d\d\d\d", txlines[ie].ltext):
                ie -= 1
            pagenumtext = txlines[ie].ltext
            mpagenumtext = re.match("(?:<b>)?(\d+)\s*(?:</b>)?$", pagenumtext)
            if not mpagenumtext:
                if IsNotQuiet():
                    print "jkjk", pagenumtext
                raise unexception(
                    "pagenum error not a number",
                    paranumC(self.undocname, None, 0, -1,
                             txlines[ie].textcountnumber))
            # a printed number differing from pageno is tolerated only for
            # documents listed in misnumberedpages
            pgoffset = int(mpagenumtext.group(1)) - self.pageno
            if pgoffset != 0 and self.undocname not in misnumberedpages:
                if IsNotQuiet():
                    print "pagenum-offset not in list", self.undocname, mpagenumtext.group(
                        1), self.pageno
                raise unexception(
                    "page pagenum error of speaker-intro",
                    paranumC(self.undocname, None, 0, -1,
                             txlines[ie].textcountnumber))
            if re.match("\d\d-\d\d\d\d\d$", txlines[ie - 1].ltext):
                ie -= 1

        else:
            # unreachable given the assert on the document name above
            assert False

        # separate out the header and footers
        self.txlheader = txlines[:ih]
        self.txlfooter = txlines[ie:]

        # separate the body into the two columns
        self.txlcol1 = []
        self.txlcol2 = []
        # smallest indent seen in each column; 9999 means "no line seen"
        self.minindentleft = 9999
        self.minindentright = 9999
        for txl in txlines[ih:ie]:
            if txl.left < 459:
                #print txl.bfootertype, txl.left, txl.width, txl.top, txl.ltext  # zzzz
                # there's a bit of spilling out where the region is larger than it should be for the words as in A-56-PV.64
                if not (txl.left + txl.width <= 459):
                    if txl.left + txl.width > 501:
                        if IsNotQuiet():
                            print txl.left, txl.width, txl.left + txl.width
                            print txl.ltext
                            print "might have page no. 1 on first page (or add to twopageagendas)"
                        raise unexception(
                            "right-hand extension excessive",
                            paranumC(txl.undocname, None, 0, -1,
                                     txl.textcountnumber))
                    # a spilling line that starts right of 165 must sit on the
                    # same `top` as a line of the latest left-column cluster
                    # that starts at left <= 165: walk backwards through that
                    # cluster's lines, asserting the shared `top`, until such
                    # a line is found (nothing is modified by this check)
                    if not (txl.left <= 165):
                        bc = -1
                        while True:
                            assert self.txlcol1[-1].txls[
                                bc].top == txl.top  # in-line but shorter
                            if (self.txlcol1[-1].txls[bc].left <= 165):
                                break
                            bc -= 1

                txl.indent = txl.left - leftcolstart
                if txl.indent < 0:
                    if IsNotQuiet():
                        print txl.indent, txl.ltext
                    raise unexception(
                        "negative indentation",
                        paranumC(txl.undocname, None, 0, -1,
                                 txl.textcountnumber))
                self.minindentleft = min(txl.indent, self.minindentleft)
                txl.brightcol = False
                AppendToCluster(self.txlcol1, txl)

            else:
                txl.indent = txl.left - rightcolstart
                # the increment is applied to every line that is not exactly at the column start
                if txl.indent != 0:
                    txl.indent += rightcolstartindentincrement
                if txl.indent < 0:
                    if IsNotQuiet():
                        print txl.indent, txl.left, rightcolstart
                        print txl.ltext
                    # NOTE(review): unlike the left-column error above, this
                    # reports self.textcountnumber (the page-wide counter)
                    # rather than txl.textcountnumber -- confirm intended
                    raise unexception(
                        "negative indent on righthand column",
                        paranumC(self.undocname, None, 0, -1,
                                 self.textcountnumber))
                self.minindentright = min(txl.indent, self.minindentright)
                txl.brightcol = True
                AppendToCluster(self.txlcol2, txl)
示例#15
0
    def __init__(self, xfil, undocname):
        self.sdate = None
        self.chairs = None
        self.agenda = None
        self.tlcall = None
        self.seccouncilmembers = None
        self.bSecurityCouncil = re.match("S-PV.\d+", undocname)
        self.bGeneralAssembly = re.match("A-\d+-PV", undocname)

        xpages = StripPageTags(xfil, undocname)
        if not xpages:
            return  # bitmap type encountered
        txpages = []
        self.tlcall = []

        for i in range(len(xpages)):
            txpage = TextPage(xpages[i], undocname, i + 1, (txpages or 0)
                              and txpages[-1].textcountnumber)
            if i == 0 and txpage.bSecurityCouncil == "ClosedSession":
                if IsNotQuiet():
                    print " -- closedsession"
                self.tlcall = None
                return  # closed session encountered
            txpages.append(txpage)

            if txpage.bSecurityCouncil and i == 0:
                continue

            # special cases of agenda overflowing into two pages
            if txpage.bSecurityCouncil and i == 1 and undocname in twopageagendas:
                txpages[0].agenda = "%s %s" % (
                    txpages[0].agenda, txpage.agenda
                )  # ram it all into one paragraph (who cares)
                continue

            bmissingcolumns = undocname in ["A-61-PV.106", "A-52-PV.39"]
            if txpage.txlcol1:
                AppendCluster(self.tlcall, txpage.txlcol1[0], "newpage")
                for tlc in txpage.txlcol1[1:]:
                    AppendCluster(self.tlcall, tlc, "gapcluster")
            elif not bmissingcolumns:
                #assert i == len(xpages) - 1  # only last page can have missing columns (sometimes it's the first)
                print "page", i, "of", len(xpages)
                #print txpages[-1].textcountnumber
                raise unexception(
                    "missing column not on last page",
                    paranumC(undocname, None, 0, -1,
                             txpages[-1].textcountnumber))

            # have had a case where the first column was the blank one
            if txpage.txlcol2:
                AppendCluster(self.tlcall, txpage.txlcol2[0], "newcolumn")
                for tlc in txpage.txlcol2[1:]:
                    AppendCluster(self.tlcall, tlc, "gapcluster")
            elif not bmissingcolumns:
                assert i == len(xpages) - 1, "%d != %d" % (i, len(xpages) - 1)

        # assign ids to the clusters
        self.sdate = txpages[0].date
        paranumlast = paranumC(undocname, self.sdate, 0, -1, 0)
        for tlc in self.tlcall:
            if tlc.txls[0].pageno == paranumlast.pageno:
                paranumlast = paranumC(undocname, self.sdate,
                                       paranumlast.pageno,
                                       paranumlast.paragraphno + 1,
                                       tlc.txls[0].textcountnumber)
            else:
                paranumlast = paranumC(undocname, self.sdate,
                                       tlc.txls[0].pageno, 1,
                                       tlc.txls[0].textcountnumber)
            tlc.paranum = paranumlast

        # merge the lines together and remove double bold/italics that happen across lines
        for tlc in self.tlcall:
            jparatext = []  # don't insert spaces where there is a hyphen
            for txl in tlc.txls:
                if jparatext and not (re.search("\w[-/]$", jparatext[-1])
                                      and re.match("\w", txl.ltext)):
                    jparatext.append(" ")
                jparatext.append(txl.ltext)
            tlc.paratext = "".join(jparatext)

            tlc.paratext = re.sub("-</i> <i>", "-", tlc.paratext)
            tlc.paratext = re.sub("-</b> <b>", "-", tlc.paratext)
            tlc.paratext = re.sub("</b>\s*\.\s*<b>", ". ", tlc.paratext)
            tlc.paratext = re.sub("Secretary- General", "Secretary-General",
                                  tlc.paratext)
            tlc.paratext = re.sub(
                "\s*(?:</i>\s*<i>|</b>\s*<b>|<b>\s*</b>|<i>\s*</i>|<b>\s*<i>\s*</b>\s*</i>)\s*",
                " ", tlc.paratext)
            tlc.paratext = tlc.paratext.strip()

            tlc.paratext = re.sub(
                "^<b>(The(?: Acting)? Co-Chairperson) \(([^\)]*)\)\s*(?:</b>\s*:|:\s*</b>)",
                "<b>\\1</b> (\\2):", tlc.paratext)
            tlc.lastindent = tlc.indents[-1][0]

        self.agenda = txpages[0].agenda
        self.chairs = txpages[0].chairs
        if self.bSecurityCouncil:
            self.seccouncilmembers = txpages[0].seccouncilmembers
示例#16
0
def DetectSpeaker(ptext, indents, paranum, speakerbeforetookchair):
    """Classify one paragraph and extract its speaker, if it has one.

    Parameters:
        ptext: paragraph text, still containing <b>/<i> markup.
        indents: list of tuples describing the indent runs of the paragraph;
            element [0] of each tuple is the indent value (elements [1] and
            [2] are only added together here -- presumably counts, confirm
            against the code that builds them).  NOTE: in the blockquote
            case below this list is modified in place.
        paranum: position object; .undocname and .sdate are read and it is
            attached to every unexception raised.
        speakerbeforetookchair: speaker tuple to fall back on when the
            paragraph has no speaker introduction of its own, or a false
            value.

    Returns:
        (ptext, typ, currentspeaker) where typ is one of "spoken",
        "italicline", "italicline-tookchair", "italicline-spokein" or
        "boldline".  For "spoken", ptext has the speaker introduction
        removed and currentspeaker is either the tuple
        (name, nation, language, bIsNotnation) or speakerbeforetookchair.
        For the other types all <i>/<b> tags are stripped from ptext and
        currentspeaker is None.

    Raises:
        unexception: for every paragraph that fits none of the recognised
            patterns (bad indentation, unknown nation, unknown title,
            unrecognised italic or bold line, ...).
        AssertionError: vote-list paragraph, or markup inside a speaker name.
    """
    #print ptext, "\n\n\n"
    if re.match("<i>(?:In favour|Against|Abstaining)",
                ptext):  # should be part of a voteblock
        print ptext
        #print tlcall[i - 1].paratext
        assert False

    # these stock sentences sometimes arrive without italics; wrap them so
    # that they go down the italic-line path below
    if re.match(
            "(?:The agenda was adopted\.|A vote was taken by show of hands\.|There being no objection, it is so decided\.)$",
            ptext):
        if IsNotQuiet():
            print "italicizingline", len(indents), ptext
        ptext = "<i>%s</i>" % ptext

    # work out an indentation complaint; whether it is fatal is decided
    # further down, once the kind of paragraph is known
    indentationerror = ""
    if len(indents) == 1 and indents[0][0] == 0:
        if not re.match("<b> ", ptext) and not re.match(
                "(?:\(|<i>)+spoke in", ptext
        ):  # often there is a speaker with a blank space at the front
            indentationerror = "unindented-paragraph"
    if len(indents) > 2:
        indentationerror = "too many different indents"
    if len(indents) == 2 and indents[1][0] != 0:
        if (indents[0][1] == 1 and ptext[0] == '"'
                and indents[0][0] - indents[1][0] > 30):
            # turn this into a blockquote
            # (the two indent entries are merged into the first one; this
            # mutates the caller's list)
            indents[0] = (indents[0][0], indents[0][1] + indents[1][1],
                          indents[0][2] + indents[1][2])
            del indents[1]
            if IsNotQuiet():
                pass
                #print "ququququq", indents
        else:
            indentationerror = "un-left-justified paragraph"

    # "<b>Mr. X </b> Y (China)": pull the name part that follows the bold
    # run back inside it
    mfixchinaspek = re.match(
        "<b>(Mr\. \w+)\s*</b>\s*([\w\-]+)\s*\((?:China|Republic of Korea)\)",
        ptext)
    if mfixchinaspek:
        #print "fixing chinaspeak", ptext, "\n"
        ptext = "<b>%s %s</b> %s" % (mfixchinaspek.group(1),
                                     mfixchinaspek.group(2),
                                     ptext[mfixchinaspek.end(2):])
        #print ptext

    # three single characters in a row, each separated by one whitespace
    # character
    if re.search("\s\S\s\S\s\S\s", ptext):
        print ptext
        raise unexception("probable gaps in text", paranum)

    # try the speaker-introduction patterns in order; the first match wins
    mspek = re.match(respekp1, ptext)
    if not mspek:
        mspek = re.match(respekp2, ptext)
    if not mspek:
        mspek = re.match(respekp3, ptext)
    if not mspek:
        mspek = re.match(respek, ptext)
    assert not mspek or not re.search("[<>]", mspek.group(1))

    # a paragraph that opens with markup but is no speaker introduction
    # cannot be a continuation of the earlier speaker
    if not mspek and re.match("<[ib]>", ptext):
        speakerbeforetookchair = ""

    if mspek or speakerbeforetookchair:
        # a continuation paragraph is allowed to be unindented
        if indentationerror == "unindented-paragraph" and speakerbeforetookchair:
            indentationerror = False
        # documents in which unindented speaker paragraphs are tolerated
        if indentationerror == "unindented-paragraph" and paranum.undocname in [
                "A-55-PV.60", "A-55-PV.63", "A-55-PV.64", "A-55-PV.68",
                "A-55-PV.59", "A-55-PV.44", "A-55-PV.46", "A-55-PV.48",
                "A-55-PV.49", "A-55-PV.52", "A-55-PV.56", "A-55-PV.51",
                "A-60-PV.37", "A-60-PV.38", "A-60-PV.42", "A-60-PV.51",
                "A-60-PV.79", "A-60-PV.85", "A-60-PV.91", "A-60-PV.86",
                "A-60-PV.87", "A-60-PV.92", "A-60-PV.93", "A-60-PV.94"
        ]:
            indentationerror = False
        if indentationerror:
            print ptext
            print indents
            raise unexception(indentationerror + " of speaker-intro", paranum)

    # diagnostic only: runs when respekSS is set to a pattern
    if respekSS and not mspek:
        m = re.match(respekSS, ptext)
        if IsNotQuiet():
            print ptext
            print "   ___ ", m and m.group(0)

    if mspek:
        assert not indentationerror
        assert not re.match("<i>", ptext)
        # group(1) is the speaker name, group(2) the nation in brackets
        speakr = re.sub("\s+", " ", mspek.group(1).strip())
        nation = ""
        bIsNotnation = True
        lnation = mspek.group(2)

        # the name itself may end in "(...)"; if that bracket holds
        # something FixNationName accepts, treat it as the nation
        mbumpnation = re.search("([^(]*?)\s*\(([^)]*)\)$", speakr)
        if mbumpnation and not lnation and FixNationName(
                mbumpnation.group(2), paranum.sdate):
            speakr = mbumpnation.group(1)
            lnation = mbumpnation.group(2)
            if IsNotQuiet():
                print "BBBB bumpingnat", speakr, lnation

        if lnation:
            # resolution order: IsPrenation, FixNationName, IsNonnation;
            # bIsNotnation is only recomputed when FixNationName is consulted
            nation = IsPrenation(lnation, paranum.sdate)
            if not nation:
                nation = FixNationName(lnation, paranum.sdate)
                bIsNotnation = not nation
            if not nation:
                nation = IsNonnation(lnation, paranum.sdate)
            if not nation:
                print ptext
                print "\ncheck if misspelt or new nonnation, can add * to front of it: ", lnation
                raise unexception("unrecognized nationC or nonnation", paranum)
        # speakers named by office need no nation
        elif not re.match(
                "The(?: Acting| Temporary)? President|The(?: Deputy| Assistant)? Secretary-General|The(?: Acting)? Chairman|Transcript",
                speakr):
            if IsNotQuiet():  # allow for less strict when done by cronjob
                raise unexception("missing nation for %s" % speakr, paranum)

        # sanity checks on the name: known title, no trailing punctuation,
        # no brackets or separators inside
        if not re.match(
                "Mr\.|Mrs\.|Miss |Ms\.|Pope |The |King |Sultan |Prince |Secretary|Arch|Dr\.|Sir |Sheikh?a? |President |Monsignor |Chairman |Crown |His |Dame |Senator |Cardinal |Chief |Captain |Acting |Begum |Major-General |Shaikh |Judge |Count |Emir |Baroness |General |Nana |Princess |U |Rev\. |Kofi |Sayyid |Sheika |Bishop |Sir. |Wilmot |Eliza |Jos|Lord |Justice |Father |Commodore |Metropolitan |Transcript|Madam ",
                speakr):
            print speakr
            raise unexception("improper title on speaker", paranum)
        if re.search("[\.,:;]$", speakr):
            print speakr
            raise unexception("improper tail on speaker", paranum)
        if re.search("[,:;\(\)]", speakr):
            print speakr
            raise unexception("improper contents in speaker", paranum)

        typ = "spoken"
        currentspeaker = (speakr, nation, (mspek.group(5) or ""), bIsNotnation
                          )  # name, nation, language, bIsNotnation
        #print currentspeaker
        # keep only what follows the speaker introduction
        ptext = ptext[mspek.end(0):]
        if re.search("</b>", ptext):
            print ptext
            raise unexception("bold in spoken text", paranum)

    elif speakerbeforetookchair:
        # no introduction of its own: the earlier speaker carries on
        assert not indentationerror
        typ = "spoken"
        currentspeaker = speakerbeforetookchair
        #print "Continuation speaker", speakerbeforetookchair

    # non-spoken text
    else:
        #<b>Mr. Al-Mahmoud </b>(Qatar) (<i>spoke in Arabic</i>):
        # bold text followed by a colon looks like a speaker introduction
        # that none of the speaker patterns accepted
        if re.match("<b>.*?(?:</b>.*?:|:</b>)(?!</b>$)", ptext):
            print ptext
            raise unexception("improperly detected spoken text", paranum)

        if re.match("\(?<i>", ptext):
            # ballot tables are exempt from the indentation and
            # italic-format checks
            mballots = re.search("Number of ballot papers", ptext)
            if mballots:
                #print "BALLOT:", ptext, "\n"
                indentationerror = False

            if indentationerror:
                print ptext
                print indents
                raise unexception(indentationerror + " of unspoken text",
                                  paranum)

            if not mballots:
                # the whole paragraph must be one italic run, optionally
                # followed by a bracketed resolution/decision reference
                mptext = re.match(
                    "<i>(.*?)</i>\.?\s*(?:\((?:resolutions?|decision|draft resolution) (A?[\d/]*\s*(?:\(?[A-Z,\s]*(?:and|to) [A-Z]\)?|[A-Z]{1,2})?)\))?\.?$",
                    ptext)
                if not mptext and not re.match("\(<i>spoke in", ptext):
                    print "--%s--" % ptext
                    raise unexception("improper italicline", paranum)

            ptext = re.sub("</?[ib]>", "", ptext).strip()

            # further parsing of these phrases may take place in due course
            # (each m... below is one accepted kind of procedural sentence;
            # the paragraph must match at least one of them)
            msodecided = re.match(
                "(?:There being no objection, )?[Ii]t (?:was|is) so decided(?: \(decision [\d/]*\s*(?:A|B|C|A and B)?\))?\.?$",
                ptext)
            # NOTE(review): the "(?i)" flag sits at the end of the pattern;
            # it still applies to the whole pattern here, but newer Python
            # versions require global inline flags at the start
            mwasadopted = re.match(
                ".*?(?:resolution|decision|agenda|amendment|recommendation).*?(?:was|were) adopted(?i)",
                ptext)
            mcalledorder = re.match(
                "The meeting (?:was called to order|rose|was suspended|was adjourned|resumed|was resumed) (?:at|on)",
                ptext)
            mtookchair = re.match(
                "\s*(?:In the absence of the President, )?(.*?)(?:, \(?Vice[\-\s]President\)?,)? (?:took|in) the [Cc]hair\.?$",
                ptext)
            mretchair = re.match(
                "(?:The President|.*?, Vice-President,|Mrs. Albright.*?|Baroness Amos) (?:returned to|in) the Chair.$",
                ptext)
            mescort = re.search(
                "(?:was escorted|escorted the.*?) (?:(?:from|to) the (?:rostrum|podium|platform)|(?:from|into|to its place in) the (?:General Assembly Hall|Conference Room|Security Council Chamber))(?: by the President and the Secretary-General)?\.?$",
                ptext)
            msecball = re.search(
                "A vote was taken by secret ballot\.(?: The meeting was suspended at|$)",
                ptext)
            mminsil = re.search(
                "The (?:members of the (?:General )?Assembly|Council) observed (?:a|one) minute of (?:silent prayer (?:or|and) meditation|silence)\.$",
                ptext)
            mtellers = re.search(
                "At the invitations? of the (?:Acting )?Presidents?.*?acted as tellers\.$|Having been drawn by lot",
                ptext)
            melected = re.search(
                "[Hh]aving obtained (?:the required (?:two-thirds )?|an absolute )majority.*?(?:(?:were|was|been|is) s?elected|will be included [io]n the list)",
                ptext)
            mmisc = re.search(
                "The Acting President drew the following.*?from the box|sang.*?for the General Assembly|The Assembly heard a musical performance|The Secretary-General presented the award to|From the .*? Group:|Having been drawn by lot by the (?:President|Secretary-General),|were elected members of the Organizational Committee|President \w+ and then Vice-President|Vice-President \S+ \S+ presided over|The following .*? States have.*?been elected members of the Security Council",
                ptext)
            mmiscnote = re.search("\[In the 79th plenary .*? III.\]$", ptext)
            mmstar = re.match("\*", ptext)  # insert * in the text
            mmspokein = re.match(
                "\(spoke in \w+(?:; interpretation.*?|; .*? the delegation)?\)$",
                ptext)

            matinvite = re.match(
                "(?:At the invitation of the President, )?.*? (?:(?:took (?:a |the )?|were escorted to their )seats? at the Council table|(?:took|was invited to take) (?:(?:the |a |their )?(?:seat|place)s? reserved for \w+|a seat|a place|places|seats|their seats|his seat) at the (?:side of the )?Council (?:[Cc]hamber|table))(?:;.*?Chamber)?\.$",
                ptext)
            mscsilence = re.match(
                "The members of the (?:Security )?Council observed a minute of silence.$",
                ptext)
            mscescort = re.search(
                "(?:were|was) escorted to (?:seats|a seat|his place|a place) at the (?:Security )?Council table.$",
                ptext)
            mvtape = re.match(
                "A video ?(?:tape)? was (?:shown|played|displayed) in the Council Chamber.$|An audio tape, in Arabic,|The members of the General Assembly heard a musical performance.$",
                ptext)
            mvprojscreen = re.match(
                "(?:An image was|Two images were|A video was) projected on screen\.$",
                ptext)
            mvresuadjourned = re.match(
                "The meeting was resumed and adjourned on.*? a\.m\.$", ptext)

            # drop the leading asterisk of a starred note
            if mmstar:
                ptext = ptext[1:]

            # first line is from general assembly.  Second line adds in some from security council
            if not (msodecided or mwasadopted or mcalledorder or mtookchair or mretchair or mballots or mescort or msecball or mminsil or mtellers or mmisc or melected or mmstar or mmiscnote or mmspokein or \
                    mvprojscreen or matinvite or mscsilence or mscescort or mvtape or mvresuadjourned):
                print "unrecognized--%s--" % ptext
                print re.match("At the invitations? of the (?:Acting )?",
                               ptext)
                raise unexception("unrecognized italicline", paranum)

            # we can add subtypes to these italic-lines
            # (when both apply, "spokein" overrides "tookchair")
            typ = "italicline"
            if mtookchair or mretchair:
                typ = "italicline-tookchair"
            if mmspokein:
                typ = "italicline-spokein"
            currentspeaker = None

        elif re.match("<b>", ptext):
            # a fully bold paragraph must match reboldline
            if not re.match(reboldline, ptext):
                print ptext
                raise unexception("unrecognized bold completion", paranum)
            ptext = re.sub("</?b>", "", ptext).strip()
            typ = "boldline"
            currentspeaker = None

        else:
            # plain text with neither a speaker nor markup is always an
            # error; the assignment to typ is never observed
            typ = "unknown"
            print ptext, indents
            raise unexception("possible indent failure", paranum)

    return ptext, typ, currentspeaker
示例#17
0
def AppendCluster(res, tlc, sclusttype):
    # check if we should merge to the next paragraph
    assert sclusttype in ["gapcluster", "newpage", "newcolumn"]

    if res and sclusttype != "gapcluster" and len(tlc.indents) == 1:
        indentp = res[-1].indents[-1][0]
        indentn = tlc.indents[0][0]

        bbothindented = ((indentp in [31, 32]) and (indentn in [31, 32])) or \
                        ((indentp in [0, 1]) and (indentn in [0, 1])) or \
                        ((indentp in [36, 33]) and (indentp == indentn))
        bonelineparacont = (len(res[-1].indents)
                            == 1) and (res[-1].indents[0][1] == 1) and (
                                indentp in [31, 32]) and (indentn in [0, 1])

        td0 = res[-1].txls[-1].ltext[:3]
        td1 = tlc.txls[0].ltext[:3]
        if not re.match("<[ib]>", td0):
            td0 = ""
        if not re.match("<[ib]>", td1):
            td1 = ""
        bstylematches = (td0 == td1)
        #assert not (bbothindented and not bstylematches)
        if re.match("<i>In favour", tlc.txls[0].ltext):
            bstylematches = False
        if re.match("<b>Agenda", res[-1].txls[-1].ltext):
            bstylematches = False

        # likely continuation of paragraph
        if bbothindented and bstylematches:
            res[-1].txls.extend(tlc.txls)
            #print tlc.txls[0].ltext
            return
        else:
            if bonelineparacont:
                if IsNotQuiet():
                    pass
                    #print "checkthiscontinuation case"
                    #print indentp, indentn, bstylematches, bonelineparacont, res[-1].indents
                    #print " ----", tlc.txls[0].ltext
                if bstylematches:
                    if IsNotQuiet():
                        pass  #print "merging"
                    res[-1].txls.extend(tlc.txls)
                    return

    # new cluster; check the indenting pattern is good
    if len(tlc.indents) == 2:
        if tlc.indents[0] <= tlc.indents[1]:
            #print tlc.indents, tlc.txls[0].ltext
            #assert re.match("<[ib]>.*?</[ib]>", tlc.txls[0].ltext) # <i>In favour:</i>
            pass

    # two paragraphs may have been merged, try to separate them out
    elif len(tlc.indents) == 4 and tlc.indents[0][0] == tlc.indents[2][
            0] and tlc.indents[1][0] == tlc.indents[3][0]:
        if IsNotQuiet():
            pass  #print tlc.indents
        assert tlc.indents[0][0] == tlc.indents[2][0]
        assert tlc.indents[1][0] == tlc.indents[3][0]
        si = tlc.indents[0][2] + tlc.indents[1][2]
        tlcf = TextLineCluster(None)
        tlcf.txls = tlc.txls[:si]
        del tlc.txls[:si]
        tlcf.indents = tlc.indents[:2]
        del tlc.indents[:2]
        res.append(tlcf)
        if IsNotQuiet():
            pass
            #print "# paragraphs", si
            #print " ", tlc.txls[0].ltext
            #print tlcf.indents, tlc.indents

    elif len(tlc.indents) != 1:
        if IsNotQuiet():
            print tlc.indents, "jjjj"
        prevtop = -1
        for txl in tlc.txls:
            if IsNotQuiet():
                if prevtop == txl.top:
                    print " ",
                print txl.indent, txl.ltext
            prevtop = txl.top
        raise unexception(
            "unrecognized indent pattern",
            paranumC(txl.undocname, None, 0, -1, txl.textcountnumber))
        assert False
    res.append(tlc)
    return
示例#18
0
def GroupParas(tlcall, undocname, sdate, seccouncilmembers):
    """Group the paragraph clusters of one document into blocks.

    Walks tlcall from start to end.  A cluster whose paratext matches
    recvoterequest starts a VoteBlock; anything else starts a SpeechBlock.
    Each block reports, through its own `i` attribute, the index of the
    first cluster it did not consume, and the walk resumes from there.

    Parameters:
        tlcall: sequence of paragraph clusters (each has a `paratext`).
        undocname: document code, e.g. "S-PV-3698".
        sdate: date-and-time string; the part after the first 10
            characters is used as the start time of the meeting.
        seccouncilmembers: passed through unchanged to VoteBlock.

    Returns:
        The list of blocks in document order.  Each block's
        paranum.blockno is numbered from 1 within its page, and the last
        block carries a `rosetime` attribute.

    Raises:
        unexception: when a "spoke in" italic line does not name a
            language, or when no rose time can be found for the document.
    """
    res = []
    curragendanum = ""
    stime = sdate[10:].strip()
    i = 0
    while i < len(tlcall):
        tlc = tlcall[i]
        if re.match(recvoterequest, tlc.paratext):
            lblock = VoteBlock(tlcall, i, undocname, sdate, seccouncilmembers)
            i = lblock.i

        # non-voting line to be processed
        else:
            # When a speech was interrupted only by a "took the chair" or
            # "spoke in ..." italic line, hand the previous speaker on to
            # the new SpeechBlock.
            speakerbeforetookchair = ""
            if (len(res) > 2) and (res[-1].typ in [
                    "italicline-tookchair", "italicline-spokein"
            ]) and (res[-2].typ == "spoken"):
                speakerbeforetookchair = res[-2].speaker
                if res[-1].typ == "italicline-spokein":
                    assert len(res[-1].paragraphs) == 1
                    mspokein = re.search(r"spoke in (\w+)",
                                         res[-1].paragraphs[0][1])
                    if not mspokein:
                        # Previously this fell through to
                        # mspokein.group(1) and died with an uncaught
                        # AttributeError; report it as a parse error so
                        # the caller's unexception handling applies.
                        if IsNotQuiet():
                            print("unrecognized spokein %s" %
                                  (res[-1].paragraphs, ))
                        raise unexception("unrecognized spokein",
                                          res[-1].paranum)
                    # replace the third field of the speaker tuple with
                    # the word captured after "spoke in"
                    speakerbeforetookchair = (speakerbeforetookchair[0],
                                              speakerbeforetookchair[1],
                                              mspokein.group(1),
                                              speakerbeforetookchair[3])

            lblock = SpeechBlock(tlcall, i, undocname, sdate,
                                 speakerbeforetookchair, curragendanum)
            if lblock.agendanum:
                curragendanum = lblock.agendanum

            i = lblock.i

        # block numbers restart at 1 on each new page
        if res and res[-1].paranum.pageno == lblock.paranum.pageno:
            lblock.paranum.blockno = res[-1].paranum.blockno + 1
        else:
            lblock.paranum.blockno = 1
        res.append(lblock)

    # find the rosetime
    if res:
        lastblock = res[-1]
        lastblock.rosetime = lastblock.ExtractRoseTime(stime)
        if undocname in [
                "S-PV-3698", "S-PV-3698-Resu.1", "S-PV-3765-Resu.2",
                "S-PV-4072-Resu.1", "S-PV-4174", "S-PV-4223", "S-PV-5100"
        ]:
            assert not lastblock.rosetime
            lastblock.rosetime = stime  # the missing rosetimes
        if not lastblock.rosetime:
            if undocname == "A-62-PV.79":
                lastblock.rosetime = "06:05"
            else:
                lastblock.writeblock(sys.stdout)
                raise unexception("can't find rosetime", lastblock.paranum)

    return res
示例#19
0
def ParsetoHTML(stem, pdfxmldir, htmldir, bforceparse, beditparse,
                bcontinueonerror):
    undocnames = []
    for undoc in os.listdir(pdfxmldir):
        undocname = os.path.splitext(undoc)[0]
        if undoc[-1] == "~":
            continue
        if not re.match(stem, undocname):
            continue
        if re.search("Corr", undocname):  # skip corregendas
            continue
        if not bforceparse:
            undochtml = os.path.join(htmldir, undocname + ".html")
            undochtmlunindexed = os.path.join(htmldir,
                                              undocname + ".unindexed.html")
            if os.path.isfile(undochtml) or os.path.isfile(undochtmlunindexed):
                continue
        undocnames.append(undocname)

    undocnames.sort()
    if IsNotQuiet():
        print "Preparing to parse %d files" % len(undocnames)

    for undocname in undocnames:
        undocpdfxml = os.path.join(pdfxmldir, undocname + ".xml")
        undochtml = os.path.join(htmldir, undocname +
                                 ".html")  # used to be ".unindexed.html"

        gparas = None
        lbeditparse = beditparse
        while not gparas:
            fin = open(undocpdfxml)
            xfil = fin.read()
            fin.close()

            if IsNotQuiet():
                print "parsing:", undocname,
            try:
                if lbeditparse:
                    lbeditparse = False
                    raise unexception("editparse", None)
                glueunfile = GlueUnfile(xfil, undocname)
                if not glueunfile.tlcall:
                    break  # happens when it's a bitmap type, or communique
                if IsNotQuiet():
                    print glueunfile.sdate  #, chairs
                gparas = GroupParas(glueunfile.tlcall, undocname,
                                    glueunfile.sdate,
                                    glueunfile.seccouncilmembers)

            except unexception, ux:
                assert not gparas
                if ux.description != "editparse":
                    if bcontinueonerror:
                        break
                    print "\n\nError: %s on page %s textcounter %s" % (
                        ux.description, ux.paranum.pageno,
                        ux.paranum.textcountnumber)
                print "\nHit RETURN to launch your editor on the pdfxml file (or type 's' to skip, or 't' to throw)"
                rl = sys.stdin.readline()
                if rl[0] == "s":
                    break
                if rl[0] == "t":
                    raise

                if ux.description != "editparse":
                    fin = open(undocpdfxml, "r")
                    finlines = fin.read()
                    fin.close()
                    mfinlines = re.match(
                        "(?s)(.*?<text ){%d}" % ux.paranum.textcountnumber,
                        finlines)
                    ln = mfinlines.group(0).count("\n")
                else:
                    ln = 1

                #editor = os.getenv('EDITOR')
                if sys.platform == "win32":
                    os.system('"C:\Program Files\ConTEXT\ConTEXT" %s /g00:%d' %
                              (undocpdfxml, ln + 2))
                else:
                    os.system('vim "%s" +%d' % (undocpdfxml, ln + 2))

        if not gparas:
            continue

        # actually write the file
        tmpfile = undochtml + "--temp"
        fout = open(tmpfile, "w")
        fout.write('<html>\n<head>\n')
        fout.write(
            '<link href="unview.css" type="text/css" rel="stylesheet" media="all">\n'
        )
        fout.write('</head>\n<body>\n')

        fout.write('\n<div class="heading" id="pg000-bk00">\n')

        sdate, stime = glueunfile.sdate[:10], glueunfile.sdate[10:].strip()
        fout.write(
            '\t<span class="code">%s</span> <span class="date">%s</span> <span class="time">%s</span>'
            % (undocname, sdate, stime))
        if gparas:
            fout.write('<span class="rosetime">%s</span>' %
                       gparas[-1].rosetime)

        fout.write('\n</div>\n')

        if glueunfile.bSecurityCouncil:
            fout.write('\n<div class="council-agenda" id="pg000-bk01">\n')
            fout.write(
                '\t<p class="boldline-p" id="pg000-bk01-pa01">%s</p>\n' %
                glueunfile.agenda)
            fout.write('</div>\n')
            fout.write('\n<div class="council-attendees" id="pg000-bk02">\n')
            ichairn = 0
            for chair in glueunfile.chairs:
                ichairn += 1
                fout.write('\t<p id="pg000-bk02-pa%02d">' % ichairn)
                for chperson in chair[0].split(
                        "/"
                ):  # just for the extremely rare case we get two people sharing the seat
                    fout.write('<span class="name">%s</span> ' %
                               chperson.strip())
                fout.write(
                    '<span class="nation">%s</span> <span class="place">%s</span></p>\n'
                    % (chair[1], chair[2]))
            fout.write('</div>')

        if glueunfile.bGeneralAssembly:
            fout.write('\n<div class="assembly-chairs" id="pg000-bk03">\n')
            ichairn = 0
            for chair in glueunfile.chairs:
                ichairn += 1
                fout.write(
                    '\t<p id="pg000-bk03-pa%02d"><span class="name">%s</span> <span class="nation">%s</span> <span class="place">president</span></p>\n'
                    % (ichairn, chair[0], chair[1]))
            fout.write('</div>\n')

        for gpara in gparas:
            gpara.writeblock(fout)

        # this for making the parsing a little easier
        fout.write('\n<div class="end-document" id="pg999-bk99">\n')
        fout.write('</div>\n')

        fout.write('\n</body>\n</html>\n')
        fout.close()
        if os.path.isfile(undochtml):
            os.remove(undochtml)
        os.rename(tmpfile, undochtml)
示例#20
0
    def __init__(self, tlcall, i, lundocname, lsdate, speakerbeforetookchair,
                 prevagendanum):
        """Build one block starting at cluster tlcall[i].

        The first cluster is classified with DetectSpeaker, which yields
        the cleaned text, the block type (self.typ) and self.speaker.
        Depending on the type the block then consumes further clusters:

        * "italicline", "italicline-tookchair", "italicline-spokein":
          only the first cluster; stored as a single "italicline"
          paragraph.
        * "boldline": every following cluster whose paratext matches
          reboldline.  For documents that are not Security Council ones,
          self.agendanum is set from DetectAgendaForm.
        * "spoken": every following cluster until DetectEndSpeech
          returns true.

        self.i is advanced past each consumed cluster, so on return it is
        the index of the first cluster that is not part of this block.
        self.paragraphs is a list of (tag, text) pairs.

        Parameters:
            tlcall: the full list of paragraph clusters of the document.
            i: index in tlcall of the cluster that starts this block.
            lundocname: document code, stored as self.undocname.
            lsdate: stored as self.sdate and passed to DetectEndSpeech.
            speakerbeforetookchair: passed through to DetectSpeaker.
            prevagendanum: passed through to DetectAgendaForm.

        Raises:
            unexception: for a boldline in a non-Security-Council document
                when no agenda is detected, or when a second agenda number
                appears in a case that is not handled.
        """
        self.tlcall = tlcall
        self.i = i
        self.sdate = lsdate
        self.undocname = lundocname
        # a match object (truthy) or None, not a real boolean
        self.bSecurityCouncil = re.match("S-PV.\d+", self.undocname)
        if not self.bSecurityCouncil:
            # the digits following "A-" in the document code; this line
            # fails if the name does not start with "A-<digits>"
            self.genasssess = re.match("A-(\d+)", self.undocname).group(1)
        self.agendanum = ""

        self.pageno, self.paranum = tlcall[i].txls[0].pageno, tlcall[i].paranum
        # paranum = ( undocname, sdate, tlc.txls[0].pageno, paranumber )
        #self.gid = self.paranum.MakeGid()

        tlc = self.tlcall[self.i]
        #print "\npppp", tlc.indents, tlc.paratext, tlc.txls
        ptext, self.typ, self.speaker = DetectSpeaker(tlc.paratext,
                                                      tlc.indents,
                                                      self.paranum,
                                                      speakerbeforetookchair)
        ptext = MarkupLinks(CleanupTags(ptext, self.typ, self.paranum),
                            self.undocname, self.paranum)
        # the first cluster is now consumed
        self.i += 1
        # italic lines form a block on their own
        if self.typ in [
                "italicline", "italicline-tookchair", "italicline-spokein"
        ]:
            self.paragraphs = [("italicline", ptext)]
            return

        # series of boldlines
        if self.typ == "boldline":
            self.agendanum = ""
            # and/or idiom: "blockquote" when lastindent is truthy, else "p"
            blinepara = tlc.lastindent and "blockquote" or "p"

            # detect the agenda
            if not self.bSecurityCouncil:
                self.agendanum = DetectAgendaForm(ptext, self.genasssess,
                                                  prevagendanum, self.paranum)
                #print "aaaaa  ", self.agendanum
                if not self.agendanum:
                    if IsNotQuiet():
                        print "if no agenda, add to AgendaTypeMap"
                    raise unexception(" uncategorized agenda title",
                                      self.paranum)

            self.paragraphs = [(blinepara, ptext)]
            # absorb the boldline clusters that follow directly
            while self.i < len(self.tlcall):
                tlc = self.tlcall[self.i]
                if not re.match(reboldline, tlc.paratext):
                    break
                ptext = MarkupLinks(
                    CleanupTags(tlc.paratext, self.typ, self.paranum),
                    self.undocname, self.paranum)

                # a second agenda number gets found
                if not self.bSecurityCouncil and re.match(
                        "Agenda(?: item)? \d+(?i)", ptext):
                    agendanum2 = DetectAgendaForm(ptext, self.genasssess,
                                                  prevagendanum, self.paranum)
                    # NOTE(review): these prints are not guarded by
                    # IsNotQuiet(), unlike the one above
                    print "agendanum from second line", agendanum2
                    assert agendanum2, ptext  # must detect it
                    if re.search("misc|show|address", self.agendanum):
                        self.agendanum = agendanum2  # a woolly agenda can be over-ridden
                    elif self.undocname == "A-62-PV.74":
                        # special case for this one document: keep both
                        self.agendanum = "%s,%s" % (self.agendanum, agendanum2)
                    else:
                        print self.agendanum
                        print ptext
                        raise unexception(" unknown extra agendanum case",
                                          self.paranum)
                    print "aaaa2aa  ", self.agendanum
                self.paragraphs.append((tlc.lastindent and "boldline-indent"
                                        or "boldline-p", ptext))
                self.i += 1
            return

        # actual spoken section
        assert self.typ == "spoken"
        assert tlc.lastindent == 0 or len(
            tlc.indents) == 1  # doesn't happen in first paragraph of speech
        self.paragraphs = [("p", ptext)]
        # absorb the following clusters until the speech is detected to end
        while self.i < len(self.tlcall):
            tlc = self.tlcall[self.i]
            if self.DetectEndSpeech(tlc.paratext, tlc.lastindent, self.sdate):
                break
            ptext = MarkupLinks(
                CleanupTags(tlc.paratext, self.typ, self.paranum),
                self.undocname, self.paranum)
            # A cluster with a single indents entry whose first field is
            # non-zero and second field exceeds 1 becomes a blockquote.
            # NOTE(review): presumably (indent, ..., line count) -- the
            # meaning of the fields is not visible here; confirm.
            bIndent = (len(tlc.indents) == 1) and (
                tlc.indents[0][0] != 0) and (tlc.indents[0][1] > 1)
            self.paragraphs.append(((bIndent and "blockquote" or "p"), ptext))
            self.i += 1