Example No. 1
def StripPageTags(xfil, undocname):
    xpages = re.findall("(<page.*\s[\s\S]*?)</page>", xfil)
    mpage1head = re.match("([\s\S]*?)(?=<text)", xpages[0])
    #print len(xpages), undoc
    if not mpage1head:
        if IsNotQuiet():
            print " -- bitmap type"
        for xpage in xpages:
            if not re.match(pagebitmap, xpage):
                print xpage
                print undocname
                assert False
        return False
    if not re.match(page1bit, mpage1head.group(1)):
        if IsNotQuiet():
            print "Probably is a bitmap type"
            print mpage1head.group(1)
            assert False
        return False
    res = [xpages[0][mpage1head.end(0):]]

    for i in range(1, len(xpages)):
        mpageihead = re.match(pageibit, xpages[i])
        if int(mpageihead.group(1)) != i + 1:
            if undocname not in misnumberedpages:
                print "misnumberedpages", mpageihead.group(
                    1), i + 1, undocname, "not in list:", misnumberedpages
                assert False

        res.append(xpages[i][mpageihead.end(0):])
    return res
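
As a quick illustration, the following standalone sketch (not part of the project) applies the same two regexes used above to an invented pdftohtml-style fragment, showing how each page body is returned with its <page ...> header stripped; the sample XML and its attributes are made up.

import re

sample = ('<page number="1" height="1263" width="892">\n'
          '<text top="90" left="90">First page body</text>\n'
          '</page>\n'
          '<page number="2" height="1263" width="892">\n'
          '<text top="90" left="90">Second page body</text>\n'
          '</page>\n')

# split into per-page chunks, then drop everything before the first <text> element
xpages = re.findall(r"(<page.*\s[\s\S]*?)</page>", sample)
for xpage in xpages:
    mpagehead = re.match(r"([\s\S]*?)(?=<text)", xpage)
    print(xpage[mpagehead.end(0):].strip())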
Example No. 2
    def ExtractDateTime(self, txline, ltext):
        # extract the date out if poss
        mdate = re.match(
            "\w+\s*, (\d+)\s+(\w+)\s+(\d+),\s*(?:at )?(\d+)[\.:]?(\d*)(?:\s+([ap])\.?\s*m\.?| noon\.?)?(?: \(closed\))?$",
            ltext)
        if not mdate:  #Tuesday, 3 December 2002, 10 a.m.
            if re.search("Friday", ltext) and IsNotQuiet():
                print ltext, re.match(
                    "\w+\s*, (\d+)\s+(\w+)\s+(\d+),\s*(?:at )?(\d+)[\.:]?(\d*)(?:\s+([ap])\.?m\.?| noon\.?)?(?: \(closed\))?",
                    ltext)
            return

        #print txlines[ih].ltext
        iday = int(mdate.group(1))
        if mdate.group(2) not in months:
            if IsNotQuiet():
                print mdate.group(2), months
            raise unexception(
                "unrecognized month",
                paranumC(txline.undocname, None, 0, -1,
                         txline.textcountnumber))
        imonth = months.index(mdate.group(2))
        syear = mdate.group(3)
        if not re.match("(?:20\d\d|19\d\d)$", syear):
            raise unexception(
                "bad year",
                paranumC(txline.undocname, None, 0, -1,
                         txline.textcountnumber))
        ihour = int(mdate.group(4))
        imin = mdate.group(5) and int(mdate.group(5)) or 0
        if mdate.group(6) and mdate.group(6) == "a" and ihour == 12:
            ihour = 0
        elif mdate.group(6) and mdate.group(6) == "p" and ihour != 12:
            ihour += 12
        if self.date:
            raise unexception(
                "date redefined",
                paranumC(txline.undocname, None, 0, -1,
                         txline.textcountnumber))
        if not (0 <= ihour <= 23) or not (0 <= imin <= 59):
            if IsNotQuiet():
                print ltext
            raise unexception(
                "bad time",
                paranumC(txline.undocname, None, 0, -1,
                         txline.textcountnumber))
        self.date = "%s-%02d-%02d %02d:%02d" % (syear, imonth + 1, iday, ihour,
                                                imin)
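
A minimal sketch of the heading-date regex, exercised outside the class on the sample line quoted in the comment above; the months list is an assumption standing in for the module-level one this method refers to, and the a.m./p.m. handling is simplified.

import re

months = ["January", "February", "March", "April", "May", "June", "July",
          "August", "September", "October", "November", "December"]

ltext = "Tuesday, 3 December 2002, 10 a.m."
mdate = re.match(
    r"\w+\s*, (\d+)\s+(\w+)\s+(\d+),\s*(?:at )?(\d+)[\.:]?(\d*)"
    r"(?:\s+([ap])\.?\s*m\.?| noon\.?)?(?: \(closed\))?$", ltext)

iday, imonth = int(mdate.group(1)), months.index(mdate.group(2)) + 1
ihour = int(mdate.group(4)) + (12 if mdate.group(6) == "p" else 0)
imin = int(mdate.group(5) or 0)
print("%s-%02d-%02d %02d:%02d" % (mdate.group(3), imonth, iday, ihour, imin))
# -> 2002-12-03 10:00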
Example No. 3
def ExtractPVlinks(meetingrecs):
    mpvcode = re.match("S/PV\.(\d+)\s*(?:\(Resumption\s*([I\d]*\))\s*)?(?:\(Part\s*(I*)\)\s*)?(\(closed\))?$", meetingrecs[0])
    assert mpvcode, meetingrecs
    #print meetingrecs
    pvcode = "S-PV-%s" % mpvcode.group(1)
    meetingnumber = int(mpvcode.group(1))
    secondarymeetingnumber = 0
    if mpvcode.group(2):  # needs to have the bracket so there is always something
        rv = mpvcode.group(2)[:-1]
        if not rv or rv == "I":
            rv = "1"
        pvcode = "%s-Resu.%d" % (pvcode, int(rv))
        secondarymeetingnumber = int(rv)

    if mpvcode.group(3):
        assert not secondarymeetingnumber  # parts and resu. don't mix
        if mpvcode.group(3) == "I":
            rp = 1
        elif mpvcode.group(3) == "II":
            rp = 2
        pvcode = "%s-Part.%d" % (pvcode, rp)
        secondarymeetingnumber = rp

    corrs = [ ]
    for corr in meetingrecs[1:]:
        if corr:
            mcorr = re.match("Corr\.(\d)\s*$", corr)
            assert mcorr, meetingrecs
            assert int(mcorr.group(1)) >= len(corrs) + 1, meetingrecs  # sometimes misses a corr
            corrs.append("%s-Corr.%d" % (pvcode, int(mcorr.group(1))))

    #print pvcode, meetingrecs[0]
    if mpvcode.group(4) and IsNotQuiet():
        print "the closed one:", pvcode
    return pvcode, (meetingnumber, secondarymeetingnumber), corrs
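
A small sketch of the S/PV heading pattern on a few invented meeting records, showing the raw resumption/part captures (note that group 2 keeps its closing bracket, which is why the code above trims it with [:-1]).

import re

pat = r"S/PV\.(\d+)\s*(?:\(Resumption\s*([I\d]*\))\s*)?(?:\(Part\s*(I*)\)\s*)?(\(closed\))?$"
for header in ["S/PV.4072", "S/PV.4072 (Resumption 1)", "S/PV.3483 (Part II)"]:
    m = re.match(pat, header)
    print("%s -> S-PV-%s resumption=%r part=%r" % (header, m.group(1), m.group(2), m.group(3)))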
Example No. 4
def ConvertXML(stem, pdfdir, pdfxmldir, bForce):
    for sd in os.listdir(pdfdir):
        if stem and not re.match(stem, sd):
            continue
        sdn, sde = os.path.splitext(sd)
        if sde != ".pdf":
            continue
        pdf = os.path.join(pdfdir, sd)
        xmldest = os.path.join(pdfxmldir, sdn + ".xml")
        if os.path.isfile(xmldest):
            if not bForce:
                #if IsNotQuiet():
                #    print "skipping", sd
                continue
            os.remove(xmldest)

        #shutil.copyfile(pdf, pdfdest)
        tmpxml = "temph.xml"
        cmd = 'pdftohtml -xml "%s" "%s"' % (pdf, os.path.splitext(tmpxml)[0])
        if IsNotQuiet():
            print cmd
        else:
            cmd = cmd + " >/dev/null 2>&1" # can't turn off output, so throw away even stderr yeuch
        os.system(cmd)
        if not os.path.isfile(tmpxml):
            print "Failed to execute and generate file"
            print cmd
            continue
        if sys.platform == "win32" and os.path.isfile(xmldest): # can't rename onto existing file in Windows
            os.remove(xmldest)
        os.rename(tmpxml, xmldest)
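
A sketch of the pdftohtml command line that ConvertXML builds; the paths are placeholders, and the command is only printed here rather than run (pdftohtml with its -xml option would need to be installed for the real conversion).

import os

pdf = os.path.join("un", "pdf", "A-57-PV.1.pdf")  # hypothetical input path
tmpxml = "temph.xml"
cmd = 'pdftohtml -xml "%s" "%s"' % (pdf, os.path.splitext(tmpxml)[0])
print(cmd)  # e.g. pdftohtml -xml "un/pdf/A-57-PV.1.pdf" "temph"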
Example No. 5
def WriteSCSummaries(stem, scsummariesdir, htmldir, pdfdir):
    screcords = [ ]
    
    # this is iterating through the pages of indexes, so will be in order    
    for lf in reversed(sorted(os.listdir(scsummariesdir))):
        if re.match("\.svn", lf):
            continue
        myear = re.search("\d\d\d\d", lf)
        assert myear, lf
        year = myear.group(0)
        if stem and not re.match(stem, year):
            continue
        if IsNotQuiet():
            print "year", year
        
        f = os.path.join(scsummariesdir, lf)
        fin = open(f)
        ftext = fin.read()
        fin.close()

        for mrow in re.finditer('(?s)<tr valign="top">(.*?)</tr>', ftext):
            row = mrow.group(1).strip()
            screcord = SCrecord(year, row, htmldir)
            screcord.FindTopicCats(htmldir, pdfdir)
            screcord.nextpvcode = screcords and screcords[-1].pvcode or None
            screcords.append(screcord)
            model.load_sc_topics(screcord.pvcode, screcord.otopicrecstr, screcord.datetime, screcord.datetimeend, 
                                 screcord.topics, screcord.minutes, screcord.numspeeches, screcord.numparagraphs, screcord.numvotes, screcord.nextpvcode)
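
A sketch of the row-extraction regex used above, applied to an invented fragment of a Security Council summary index page; each captured row would then be handed to SCrecord.

import re

ftext = ('<tr valign="top"><td>S/PV.4072</td><td>Kosovo</td></tr>\n'
         '<tr valign="top"><td>S/PV.4073</td><td>Africa</td></tr>\n')
for mrow in re.finditer('(?s)<tr valign="top">(.*?)</tr>', ftext):
    print(mrow.group(1).strip())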
Example No. 6
    def ExtractDotLineChair(self, txlines, ih):
        assert self.pageno == 1
        #<text top="334" left="185" width="584" height="17" font="2">Mr.  Kavan  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . (Czech Republic)</text>
        while True:
            #print "------" + txlines[ih].ltext
            mchair = re.search("([^>:]*?)\s*\. \. \. \. \.", txlines[ih].ltext)
            if mchair:
                break

            # fix missing year date
            #if self.undocname == "A-55-PV.44" and txlines[ih].ltext == "Monday, 30 October, 10 a.m.":
            #    txlines[ih].ltext = "Monday, 30 October 2000, 10 a.m."
            self.ExtractDateTime(txlines[ih], txlines[ih].ltext)

            ih += 1
            if ih == len(txlines):
                return -1

        if not self.date:
            if IsNotQuiet():
                for i in range(ih):
                    print "--%s--" % txlines[i].ltext
            raise unexception(
                "dotlinechair date problem",
                paranumC(txlines[ih].undocname, None, 0, -1,
                         txlines[ih].textcountnumber))
            assert False

        # when country name for the president . . . . is not on same line
        mcountry = re.search("\((.*?)\)$", txlines[ih].ltext)
        if not mcountry:
            ih += 1
            #print txlines[ih].ltext
            mcountry = re.match("\((.*?)\)$", txlines[ih].ltext)
            if not mcountry:
                if IsNotQuiet():
                    print txlines[ih].ltext
                raise unexception(
                    "unable to extract country from  ...-line",
                    paranumC(txlines[ih].undocname, None, 0, -1,
                             txlines[ih].textcountnumber))
        ih += 1
        chairname = re.sub("\s\s+", " ", mchair.group(1)).strip()
        self.chairs.append(
            (chairname, FixNationName(mcountry.group(1), self.date)))
        return ih
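
A sketch of the dotted chair-line parsing on the sample quoted in the comment at the top of this method; both the name and the country come from that comment.

import re

ltext = "Mr.  Kavan  . . . . . . . . . . (Czech Republic)"
mchair = re.search(r"([^>:]*?)\s*\. \. \. \. \.", ltext)
mcountry = re.search(r"\((.*?)\)$", ltext)
chairname = re.sub(r"\s\s+", " ", mchair.group(1)).strip()
print("%s -- %s" % (chairname, mcountry.group(1)))  # Mr. Kavan -- Czech Republic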
Example No. 7
def ScrapeContentsPageFromStem(stem):
    # this attempts to scrape PV and corrigenda assembly vertbatims by generating the codes
    # we could lead on from the last known
    mpv = re.match("A-(\d+)-PV$", stem)
    if mpv:
        # these should search for gaps
        repv = re.compile("A-%s-PV.(\d+)(?:-Corr.(\d+))?" % mpv.group(1))
        pvdone = []
        for f in os.listdir(pdfdir):
            mfm = repv.match(f)
            if mfm:
                pvdone.append(int(mfm.group(1)))

        # onwards values
        pvdone.sort()
        if IsNotQuiet():
            print "pvddd", pvdone
        v = (pvdone and pvdone[-1] or 0)
        vn = v + 1
        while vn - v < 3:
            if ScrapePDF("A-%s-PV.%d" % (mpv.group(1), vn)):
                v = vn
                ScrapePDF("A-%s-PV.%d-Corr.1" % (mpv.group(1), vn))
            vn += 1

        # missing values
        while len(pvdone) >= 2:
            vn = pvdone[-1] - 1
            if pvdone[-2] < vn:
                if ScrapePDF("A-%s-PV.%d" % (mpv.group(1), vn)):
                    ScrapePDF("A-%s-PV.%d-Corr.1" % (mpv.group(1), vn))
                pvdone[-1] = vn
            else:
                del pvdone[-1]

        return

    # this works from other contents pages for general assemblies
    if stem in scrapepvurlmap:
        ScrapeContentsPage(scrapepvurlmap[stem])
        return

    # security council scrapage
    mspv = re.match("S-(\d+)-PV", stem)
    if mspv:
        assert 1994 <= int(mspv.group(1)) < 2009   # should use current year
        ScrapeSCContentsPage(int(mspv.group(1)), "http://www.un.org/Depts/dhl/resguide/scact%s.htm" % mspv.group(1))
        return

    print "Allowable stems for scraping are 'A-\d\d-PV' or 'S-\d\d\d\d(year)-PV', or"
    print ",\n  ".join(scrapepvurlmap.keys())
    assert False
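
A quick sketch of the two stem shapes this function recognises, on invented stems (the scrapepvurlmap lookup is left out).

import re

for stem in ["A-57-PV", "S-2002-PV", "A-RES-57"]:
    mpv = re.match(r"A-(\d+)-PV$", stem)
    mspv = re.match(r"S-(\d+)-PV", stem)
    print("%s: assembly=%s security-council=%s" % (stem, bool(mpv), bool(mspv)))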
Example No. 8
def WriteAgendaSummaries(stem, htmldir):
    rels = GetAllHtmlDocs("", False, False, htmldir)

    agendagroups = {}
    for htdoc in rels:
        maga = re.search("(A-\d\d-PV\.\d+)\.(?:unindexed\.)?html", htdoc)
        masc = re.search(
            "(S-PV.\d+(?:-(?:Resu|Part)\.\d)?)\.(?:unindexed\.)?html", htdoc)

        if not maga:
            if not masc:
                print "Whatis", htdoc
            continue

        docid = maga.group(1)

        if stem and not re.match(stem, docid):
            continue

        fin = open(htdoc)
        ftext = fin.read()
        fin.close()

        mdate = re.search('<span class="date">(\d\d\d\d-\d\d-\d\d)</span>',
                          ftext)
        sdate = mdate.group(1)

        if IsNotQuiet():
            print docid,
        agendasdoc = AddAgendaGroups(agendagroups, sdate, docid, ftext)
        #if len(agendagroups) > 100:
        #    print "preeeematureabort"
        #    break

        # copy agenda data into database
        gaagindoc = []
        for ag in agendasdoc:
            gaagindoc.append(
                (ag.subheadingid, ag.agendanumstr, "||".join(ag.titlelines)))
        model.load_ga_debate(docid, sdate, gaagindoc)

    # the agendagroups are lists of agenda items; call them topics
    allagendas = []
    recentagendas = []
    for agendanum, aggroup in agendagroups.iteritems():
        agsession = aggroup[0][1].nsession
        mctitle, mccategory = FindDelCommonTitle(agendanum, aggroup)
        model.load_ga_agendanum(agsession, agendanum, mctitle, mccategory,
                                [(ag.docid, ag.subheadingid)
                                 for ag0, ag in aggroup])
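
A sketch of the two document-id patterns used to classify the html filenames, run on invented names; the last one matches neither and would be reported as "Whatis".

import re

for htdoc in ["A-57-PV.1.html", "S-PV.4072-Resu.1.unindexed.html", "readme.html"]:
    maga = re.search(r"(A-\d\d-PV\.\d+)\.(?:unindexed\.)?html", htdoc)
    masc = re.search(r"(S-PV.\d+(?:-(?:Resu|Part)\.\d)?)\.(?:unindexed\.)?html", htdoc)
    m = maga or masc
    print("%s -> %s" % (htdoc, m and m.group(1)))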
Example No. 9
def ScrapeGASummaries(gasummariesdir):
    for sess in range(1, currentsession + 1):
        f = os.path.join(gasummariesdir, "gaact%d.html" % sess)
        url = GASummariesURL(sess)

        if sess == currentsession or (sess == currentsession - 1
                                      and currentmonth
                                      == 9) or not os.path.isfile(f):
            if IsNotQuiet():
                print "Scraping", url
            fin = urllib2.urlopen(url)
            gaindext = fin.read()
            fin.close()

            fout = open(f, "w")
            fout.write(gaindext)
            fout.close()
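
A sketch of the refresh rule above: the current session (and, during September, the one before it) is always re-fetched, while older sessions are only fetched if the cached file is missing; the session and month values below are invented.

import os

currentsession, currentmonth = 63, 9
for sess in (62, 63, 1):
    f = "gaact%d.html" % sess
    refetch = (sess == currentsession
               or (sess == currentsession - 1 and currentmonth == 9)
               or not os.path.isfile(f))
    print("%s refetch=%s" % (f, refetch))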
Example No. 10
def ScrapeContentsPage(contentsurl):
    if IsNotQuiet():
        print "URL index:", contentsurl

    fin = urllib2.urlopen(contentsurl)
    plenaryindex = fin.read()
    fin.close()

    # <a href="http://daccess-ods.un.org/access.nsf/Get?OpenAgent&DS=A/57/PV.1&Lang=E" target="_blank">A/57/PV.1</a>

    plenaryindexlist = re.findall('<a\s+href="(http://daccess[^"]*)" target="_blank">(.*?)</a>(?is)', plenaryindex)
    if not plenaryindexlist:
        plenaryindexlist = re.findall('<a target="_blank" href="(http://daccess[^"]*)">(.*?)</a>(?i)', plenaryindex)
    for plenary in plenaryindexlist[:]:
        undocname = re.sub("/", "-", plenary[1])
        undocname = re.sub("\s|<.*?>", "", undocname)
        undocname = re.sub("SecurityCouncilresolution", "S-RES-", undocname)
        assert re.match("(?:A-RES-\d\d-\d+|A-\d\d-PV-\d+|S-RES-\d+\(\d+\))$", undocname)
        ScrapePDF(undocname, contentsurl, plenary[0])
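
A sketch of the anchor matching and document-name normalisation, using the sample link quoted in the comment above (the assert and the ScrapePDF call are omitted).

import re

plenaryindex = ('<a href="http://daccess-ods.un.org/access.nsf/Get?OpenAgent&DS=A/57/PV.1&Lang=E"'
                ' target="_blank">A/57/PV.1</a>')
for purl, name in re.findall(r'(?is)<a\s+href="(http://daccess[^"]*)" target="_blank">(.*?)</a>',
                             plenaryindex):
    undocname = re.sub("/", "-", name)
    undocname = re.sub(r"\s|<.*?>", "", undocname)
    print("%s -> %s" % (name, undocname))  # A/57/PV.1 -> A-57-PV.1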
Example No. 11
    def __init__(self, txline, lundocname, lpageno, textcountnumber):
        mxline = re.match(
            '<text top="(\d+)" left="(\d+)" width="-?(\d+)" height="(\d+)" font="(\d+)">(.*?)</text>',
            txline)
        if not mxline:
            print txline, "tttttt"
        self.top = int(mxline.group(1))
        self.left = int(mxline.group(2))
        self.width = int(mxline.group(3))
        self.height = int(mxline.group(4))
        self.font = int(mxline.group(5))
        self.pageno = lpageno
        self.undocname = lundocname
        self.textcountnumber = textcountnumber
        self.ltext = mxline.group(6).strip()

        self.ltext = re.sub("<i>\s*</i>|<b>\s*</b>", " ", self.ltext)
        if re.match("<[ib]>\s*</[ib]>|\s*$", self.ltext):
            self.ltext = ""

        # will be removed
        if not self.ltext:
            return

        self.bfootertype = (self.left < 459
                            and self.left + self.width > 459) or re.match(
                                footertext, self.ltext)
        #if self.bfootertype:
        #    print self.ltext

        # move on any short bits that are like 13^(th)
        if self.height == 11 and not self.bfootertype and self.width <= 10:
            #print self.left, self.width, "'%s'" % self.ltext
            assert self.width <= 10
            if self.ltext not in ["th", "rd", "st", "nd"]:
                if IsNotQuiet():
                    print self.ltext
                raise unexception(
                    "unrecognized shortbit",
                    paranumC(self.undocname, None, 0, -1,
                             self.textcountnumber))
            self.top += 2  # push the step down from 16 to 18
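
A sketch of the <text> attribute parsing done at the top of this constructor, using the dot-line sample quoted in Example No. 6.

import re

txline = ('<text top="334" left="185" width="584" height="17" font="2">'
          'Mr.  Kavan  . . . (Czech Republic)</text>')
mxline = re.match(r'<text top="(\d+)" left="(\d+)" width="-?(\d+)" height="(\d+)"'
                  r' font="(\d+)">(.*?)</text>', txline)
print("top=%s left=%s width=%s text=%r" % (mxline.group(1), mxline.group(2),
                                           mxline.group(3), mxline.group(6).strip()))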
Example No. 12
def ScrapeSCSummaries(scsummariesdir):
    #print "Skipping ScrapeSCSummaries"
    #return

    currentdate = datetime.date.today()
    currentyear = currentdate.year
    currentmonth = currentdate.month
    for y in range(1994, currentyear + 1):
        f = os.path.join(scsummariesdir, "scact%d.html" % y)
        url = "http://www.un.org/Depts/dhl/resguide/scact%d.htm" % y
        if y == currentyear or (y == currentyear - 1 and currentmonth == 1) or not os.path.isfile(f):
            if IsNotQuiet():
                print "Scraping", url
            fin = urllib2.urlopen(url)
            scindext = fin.read()
            fin.close()

            fout = open(f, "w")
            fout.write(scindext)
            fout.close()
Example No. 13
def LoadAllVotes(rels):
    res = { }  # { nation : { voterecid: vote } }
    for nation in nationdates:
        res[nation] = { }
    res["Brunei Darussalam"] = {}# quick fix

    for rel in rels:
        if IsNotQuiet():
            print "loading:", rel
        fin = open(rel)
        doccontent = fin.read()
        fin.close()

        document_id = re.search('<span class="code">([^<]*)</span>', doccontent).group(1)
        for recvotet in re.findall('<p class="votelist" id="(pg\d+-bk\d+)-pa\d+">(.*?)</p>', doccontent):
            #print document_id, recvotet[0]
            for voten in re.findall('<span class="([^"]*)">([^<]*)</span>', recvotet[1]):
                res[voten[1]][(document_id, recvotet[0])] = re.match(".*?([^\-]*)", voten[0]).group(1)

    #print res["Sudan"]
    return res
Example No. 14
def AppendToCluster(txlcol, txl):

    # frig the indentation on the most common mistakes
    if re.match(
            "<i>The meeting (?:was called|was suspended|rose at|was resumed)",
            txl.ltext) and (txl.indent == 0):
        txl.indent = 31

    if not txlcol:
        txlcol.append(TextLineCluster(txl))
        return
    txl.vgap = txl.top - txlcol[-1].txls[-1].top

    #print txlcol[-1].txls[-1].ltext
    #print txl.vgap, txl.width, txl.height, txl.top,  txl.ltext  # zzzz

    # frig vgaps in some cases where the spacing was wider than normal
    if txl.undocname in ["A-50-PV.84", "A-50-PV.88"]:
        if txl.vgap == 21 or txl.vgap == 22:
            txl.vgap = 18
        if txl.vgap == 42:
            txl.vgap = 43
    if txl.undocname == "S-PV-5584":
        if txl.vgap == 20:
            txl.vgap = 19

    if not txl.vgap in familiarvgaps:
        if IsNotQuiet():
            print "\n\n   vgap=", txl.vgap, "\n\nwidth/height/top", txl.width, txl.height, txl.top, txl.ltext  # zzzz
            print " familiar vgaps:", familiarvgaps
        raise unexception(
            "vgap not familiar",
            paranumC(txl.undocname, None, 0, -1, txl.textcountnumber))
    if txl.vgap in (0, 17, 18, 19) or txl.vgap == 0:
        txlcol[-1].AddLine(txl)
    else:
        #print txl.vgap, "vvvv", txl.ltext
        txlcol.append(TextLineCluster(txl))
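
A sketch of the clustering rule above, with TextLineCluster reduced to a plain list: vertical gaps of 17-19 points (or 0) continue the current cluster, while larger but familiar gaps start a new one; familiarvgaps and the line tops are invented values.

familiarvgaps = (0, 17, 18, 19, 36, 43)
tops = [100, 118, 136, 179, 197]

clusters = []
lasttop = None
for top in tops:
    vgap = (top - lasttop) if lasttop is not None else 0
    assert vgap in familiarvgaps
    if clusters and vgap in (0, 17, 18, 19):
        clusters[-1].append(top)  # small gap: stays in the current cluster
    else:
        clusters.append([top])    # larger (but familiar) gap: new cluster
    lasttop = top
print(clusters)  # [[100, 118, 136], [179, 197]]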
Example No. 15
    def DetectAdoption(self):
        adtext = re.sub("</?i>", "", self.tlcall[self.i].paratext)
        madtext = re.search(
            "(adopted|carried|retained.*?|rejected)(?:, as amended,|, as a whole,)?\s+by(?: votes)?\s+(\d+)(?:\s+votes)? to\s+(\d+|none)(?:,? with (\d+) abstentions?)?",
            adtext)
        if not madtext:
            madtext = re.match(
                "(By)\s+(\d+)(?:\s+votes)? to\s+(\d+|none)(?:,? with (\d+) abstentions?)?",
                adtext)
        if not madtext:
            print "--%s-- %d" % (adtext, self.i)
            raise unexception("by votes problem", self.tlcall[self.i].paranum)
        ifavour = int(madtext.group(2))
        iagainst = (madtext.group(3) != "none" and int(madtext.group(3)) or 0)
        #if madtext.group(1) == "rejected":
        #    i = ifavour;  ifavour = iagainst;  iagainst = i
        iabstain = (madtext.group(4) and int(madtext.group(4)) or 0)
        if madtext.group(1) == "rejected":
            il = (iagainst, ifavour, iabstain)
        else:
            il = (ifavour, iagainst, iabstain)
        ivl = (len(self.vlfavour), len(self.vlagainst), len(self.vlabstain))
        if il != ivl:
            if IsNotQuiet():
                print "wrong-count", self.undocname, il, ivl
            # wrong values are found on A-57-PV.73 (favour=154, 152)
            if self.undocname not in [
                    "A-56-PV.82",
                    "A-57-PV.73",
                    "A-58-PV.54",
                    "A-52-PV.69",
                    "A-50-PV.90",
                    "A-49-PV.83",
            ]:
                raise unexception("wrong votecount",
                                  self.tlcall[self.i].paranum)
        self.motiontext = MarkupLinks(adtext, self.undocname, self.paranum)
        self.i += 1
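
A sketch of the vote-count regex above, applied to an invented "adopted by ... votes to ..." sentence (the cross-check against the recorded vote lists is left out).

import re

adtext = "Draft resolution A was adopted by 150 votes to 2, with 5 abstentions"
madtext = re.search(
    r"(adopted|carried|retained.*?|rejected)(?:, as amended,|, as a whole,)?"
    r"\s+by(?: votes)?\s+(\d+)(?:\s+votes)? to\s+(\d+|none)(?:,? with (\d+) abstentions?)?",
    adtext)
print("%s favour=%s against=%s abstain=%s" % (madtext.group(1), madtext.group(2),
                                              madtext.group(3), madtext.group(4) or 0))
# -> adopted favour=150 against=2 abstain=5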
Example No. 16
# in capital letters.  Not currently incorporated into the system.
if bGAsummaries:
    agsummariesdir = os.path.join(indexstuffdir, "gasummariesdir")
    if not os.path.isdir(agsummariesdir):
        os.mkdir(agsummariesdir)
    ScrapeGASummaries(agsummariesdir)
    sess = 1
    ParseScrapeGASummaries(agsummariesdir, pdfinfodir, sess)

if bNationData:
    ScrapePermMissions()
    NationDataSucker()

if bVoteDistances:
    f = os.path.join(indexstuffdir, "votetable.txt")
    if IsNotQuiet():
        print "Writing vote distance to file:", f
    fout = open(f, "w")
    WriteVoteDistances(stem, htmldir, fout)
    fout.close()

if bDocimages:
    GenerateDocimages(stem, options.forcedocimg, options.limit, pdfdir, pdfpreviewdir, pdfinfodir, tmppdfpreviewdir)

# this may be out-dated
if bScrapewp:
    FetchWikiBacklinks(commentsdir)

if bLoadMPs:
    LoadMPs()
Example No. 17
def ParsetoHTML(stem, pdfxmldir, htmldir, bforceparse, beditparse,
                bcontinueonerror):
    undocnames = []
    for undoc in os.listdir(pdfxmldir):
        undocname = os.path.splitext(undoc)[0]
        if undoc[-1] == "~":
            continue
        if not re.match(stem, undocname):
            continue
        if re.search("Corr", undocname):  # skip corregendas
            continue
        if not bforceparse:
            undochtml = os.path.join(htmldir, undocname + ".html")
            undochtmlunindexed = os.path.join(htmldir,
                                              undocname + ".unindexed.html")
            if os.path.isfile(undochtml) or os.path.isfile(undochtmlunindexed):
                continue
        undocnames.append(undocname)

    undocnames.sort()
    if IsNotQuiet():
        print "Preparing to parse %d files" % len(undocnames)

    for undocname in undocnames:
        undocpdfxml = os.path.join(pdfxmldir, undocname + ".xml")
        undochtml = os.path.join(htmldir, undocname +
                                 ".html")  # used to be ".unindexed.html"

        gparas = None
        lbeditparse = beditparse
        while not gparas:
            fin = open(undocpdfxml)
            xfil = fin.read()
            fin.close()

            if IsNotQuiet():
                print "parsing:", undocname,
            try:
                if lbeditparse:
                    lbeditparse = False
                    raise unexception("editparse", None)
                glueunfile = GlueUnfile(xfil, undocname)
                if not glueunfile.tlcall:
                    break  # happens when it's a bitmap type, or communique
                if IsNotQuiet():
                    print glueunfile.sdate  #, chairs
                gparas = GroupParas(glueunfile.tlcall, undocname,
                                    glueunfile.sdate,
                                    glueunfile.seccouncilmembers)

            except unexception, ux:
                assert not gparas
                if ux.description != "editparse":
                    if bcontinueonerror:
                        break
                    print "\n\nError: %s on page %s textcounter %s" % (
                        ux.description, ux.paranum.pageno,
                        ux.paranum.textcountnumber)
                print "\nHit RETURN to launch your editor on the pdfxml file (or type 's' to skip, or 't' to throw)"
                rl = sys.stdin.readline()
                if rl[0] == "s":
                    break
                if rl[0] == "t":
                    raise

                if ux.description != "editparse":
                    fin = open(undocpdfxml, "r")
                    finlines = fin.read()
                    fin.close()
                    mfinlines = re.match(
                        "(?s)(.*?<text ){%d}" % ux.paranum.textcountnumber,
                        finlines)
                    ln = mfinlines.group(0).count("\n")
                else:
                    ln = 1

                #editor = os.getenv('EDITOR')
                if sys.platform == "win32":
                    os.system('"C:\Program Files\ConTEXT\ConTEXT" %s /g00:%d' %
                              (undocpdfxml, ln + 2))
                else:
                    os.system('vim "%s" +%d' % (undocpdfxml, ln + 2))

        if not gparas:
            continue

        # actually write the file
        tmpfile = undochtml + "--temp"
        fout = open(tmpfile, "w")
        fout.write('<html>\n<head>\n')
        fout.write(
            '<link href="unview.css" type="text/css" rel="stylesheet" media="all">\n'
        )
        fout.write('</head>\n<body>\n')

        fout.write('\n<div class="heading" id="pg000-bk00">\n')

        sdate, stime = glueunfile.sdate[:10], glueunfile.sdate[10:].strip()
        fout.write(
            '\t<span class="code">%s</span> <span class="date">%s</span> <span class="time">%s</span>'
            % (undocname, sdate, stime))
        if gparas:
            fout.write('<span class="rosetime">%s</span>' %
                       gparas[-1].rosetime)

        fout.write('\n</div>\n')

        if glueunfile.bSecurityCouncil:
            fout.write('\n<div class="council-agenda" id="pg000-bk01">\n')
            fout.write(
                '\t<p class="boldline-p" id="pg000-bk01-pa01">%s</p>\n' %
                glueunfile.agenda)
            fout.write('</div>\n')
            fout.write('\n<div class="council-attendees" id="pg000-bk02">\n')
            ichairn = 0
            for chair in glueunfile.chairs:
                ichairn += 1
                fout.write('\t<p id="pg000-bk02-pa%02d">' % ichairn)
                for chperson in chair[0].split(
                        "/"
                ):  # just for the extremely rare case we get two people sharing the seat
                    fout.write('<span class="name">%s</span> ' %
                               chperson.strip())
                fout.write(
                    '<span class="nation">%s</span> <span class="place">%s</span></p>\n'
                    % (chair[1], chair[2]))
            fout.write('</div>')

        if glueunfile.bGeneralAssembly:
            fout.write('\n<div class="assembly-chairs" id="pg000-bk03">\n')
            ichairn = 0
            for chair in glueunfile.chairs:
                ichairn += 1
                fout.write(
                    '\t<p id="pg000-bk03-pa%02d"><span class="name">%s</span> <span class="nation">%s</span> <span class="place">president</span></p>\n'
                    % (ichairn, chair[0], chair[1]))
            fout.write('</div>\n')

        for gpara in gparas:
            gpara.writeblock(fout)

        # this for making the parsing a little easier
        fout.write('\n<div class="end-document" id="pg999-bk99">\n')
        fout.write('</div>\n')

        fout.write('\n</body>\n</html>\n')
        fout.close()
        if os.path.isfile(undochtml):
            os.remove(undochtml)
        os.rename(tmpfile, undochtml)
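
A sketch of the editor-positioning trick used in the error handler above: the textcountnumber is converted to a line number by counting newlines up to the n-th "<text " occurrence; the xml snippet is invented.

import re

finlines = ('<page number="1">\n<text top="1">one</text>\n'
            '<text top="2">two</text>\n<text top="3">three</text>\n</page>\n')
textcountnumber = 2
mfinlines = re.match("(?s)(.*?<text ){%d}" % textcountnumber, finlines)
print(mfinlines.group(0).count("\n"))  # 2, so the editor would be opened near line 2 + 2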
Example No. 18
def GroupParas(tlcall, undocname, sdate, seccouncilmembers):
    res = []
    i = 0
    currentspeaker = None
    curragendanum = ""
    while i < len(tlcall):
        tlc = tlcall[i]
        if re.match(recvoterequest, tlc.paratext):
            lblock = VoteBlock(tlcall, i, undocname, sdate, seccouncilmembers)
            i = lblock.i

        # non-voting line to be processed
        else:

            speakerbeforetookchair = ""
            if (len(res) > 2) and (res[-1].typ in [
                    "italicline-tookchair", "italicline-spokein"
            ]) and (res[-2].typ == "spoken"):
                speakerbeforetookchair = res[-2].speaker
                if res[-1].typ == "italicline-spokein":
                    assert len(res[-1].paragraphs) == 1
                    mspokein = re.search("spoke in (\w+)",
                                         res[-1].paragraphs[0][1])
                    if not mspokein:
                        if IsNotQuiet():
                            print "unrecognized spokein", res[-1].paragraphs
                    #print "converting spokein", speakerbeforetookchair[2], mspokein.group(1)
                    speakerbeforetookchair = (speakerbeforetookchair[0],
                                              speakerbeforetookchair[1],
                                              mspokein.group(1),
                                              speakerbeforetookchair[3])

            lblock = SpeechBlock(tlcall, i, undocname, sdate,
                                 speakerbeforetookchair, curragendanum)
            if lblock.agendanum:
                curragendanum = lblock.agendanum

            i = lblock.i

        if res and res[-1].paranum.pageno == lblock.paranum.pageno:
            lblock.paranum.blockno = res[-1].paranum.blockno + 1
        else:
            lblock.paranum.blockno = 1
        res.append(lblock)

    # find the rosetime
    if res:
        res[-1].rosetime = res[-1].ExtractRoseTime(sdate[10:].strip())
        if undocname in [
                "S-PV-3698", "S-PV-3698-Resu.1", "S-PV-3765-Resu.2",
                "S-PV-4072-Resu.1", "S-PV-4174", "S-PV-4223", "S-PV-5100"
        ]:
            assert not res[-1].rosetime
            res[-1].rosetime = sdate[10:].strip()  # the missing rosetimes
        if not res[-1].rosetime:
            if undocname == "A-62-PV.79":
                res[-1].rosetime = "06:05"
            else:
                res[-1].writeblock(sys.stdout)
                raise unexception("can't find rosetime", res[-1].paranum)

    return res
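
A sketch of the "(spoke in ...)" extraction used above when re-labelling the speaker before a took-the-chair line; the paragraph text is invented.

import re

paragraph = "<i>(spoke in French)</i>"
mspokein = re.search(r"spoke in (\w+)", paragraph)
print(mspokein.group(1))  # French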
Example No. 19
def ScrapePDF(undocname, plenaryurl="http://www.un.org/ga/59/documentation/list0.html", purl=None, bforce=False):
    pdfname = undocname + ".pdf"
    pdffile = os.path.join(pdfdir, pdfname)
    if not bforce and os.path.isfile(pdffile):
        if IsNotQuiet():
            print "  skipping", pdffile, pdfname
        return True

    if not purl:
        mares = re.match("A-RES-(\d+)-(\d+)$", undocname)
        maresr = re.match("A-RES-(\d+)\((S-I|[IVXL]+)\)$", undocname)  # resolutions used to have sessions in roman numerals
        meres = re.match("E-RES-(\d\d\d\d)-(\d+)$", undocname)  # don't know what the code is
        madoc = re.match("A-(\d\d)-((?:L\.|CRP\.)?\d+)([\w\.\-\(\)]*)$", undocname)
        msres = re.match("S-RES-(\d+)\((\d+)\)$", undocname)
        mapv  = re.match("A-(\d\d)-PV.(\d+)(-Corr.\d|)$", undocname)
        macdoc = re.match("A-AC.(\d+)-(\d\d\d\d)-(\d+)$", undocname)
        maodoc = re.match("A-(\d+)(-?[\w\.\-]*)$", undocname)
        mspv = re.match("S-PV.(\d+)(?:-Resu\.(\d+))?$", undocname)
        scdoc = re.match("S-(\d\d\d\d)-(\d+)(-Corr.\d|)(\(SUPP\)|)$", undocname)
        mscodoc = re.match("S-(\d+)(-?[\w\.\-]*)$", undocname)
        #stdoc = re.match("ST-SGB-(\d+)$", undocname)  # experimental secretariat document
        dashdoc = re.match("ST-|A-C", undocname)
        munknown = re.match("(?:ECESA/1/Rev.1|S-26-2)$", undocname)
        mahrc = re.match("A-HRC(?:-S-(\d+))?-(\d[\w\.\-]*)$", undocname)
        mprst = re.match("S-PRST-(\d\d\d\d)-(\d+)$", undocname)

        if mares:
            if int(mares.group(1)) < 1:  # limit the sessions we take these resolutions from
                return False
            purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=A/RES/%s/%s&Lang=E" % (mares.group(1), mares.group(2))
        #if meres:
        #    purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=E/RES/%s/%s&Lang=E" % (meres.group(1), meres.group(2))
        elif maresr:
            if maresr.group(2) == "S-I":
                purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=A/RES/%s(S-1)&Lang=E" % (maresr.group(1))
            else:
                purl = "http://daccess-ods.un.org/access.nsf/Get?OpenAgent&DS=A/RES/%s(%s)&Lang=E&Area=RESOLUTION" % (maresr.group(1), maresr.group(2))

        elif dashdoc:
            # works for ST/SGB/...
            dashcode = re.sub("-", "/", undocname)
            #purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=ST/SGB/%s&Lang=E" % (stdoc.group(1))
            purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=%s&Lang=E" % dashcode

        elif madoc:
            if int(madoc.group(1)) < 1:  # limit the sessions we take these resolutions from
                return False
            tail = re.sub("-", "/", madoc.group(3))
            purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=A/%s/%s%s&Lang=E" % (madoc.group(1), madoc.group(2), tail)
            #print purl

        elif macdoc:
            purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=A/AC.%s/%s/%s&Lang=E" % (macdoc.group(1), macdoc.group(2), macdoc.group(3))

        elif scdoc:
            tail = re.sub("-", "/", scdoc.group(3))
            purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=S/%s/%s%s%s&Lang=E" % (scdoc.group(1), scdoc.group(2), tail, scdoc.group(4))
        elif mprst:
            purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=S/PRST/%s/%s&Lang=E" % (mprst.group(1), mprst.group(2))

        elif msres:
            sarea = int(msres.group(1)) <= 766 and "RESOLUTION" or "UNDOC"
            purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=S/RES/%s%%20(%s)&Lang=E&Area=%s" % (msres.group(1), msres.group(2), sarea)
            plenaryurl = "http://www.un.org/Docs/scres/2002/sc2002.htm"
        elif mspv:
            tail = mspv.group(2) and ("(Resumption%s)" % mspv.group(2)) or ""
            purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=S/PV.%s%s&Lang=E" % (mspv.group(1), tail)
            plenaryurl = "http://www.un.org/Docs/scres/2002/sc2002.htm"
        elif mapv:
            #if int(mapv.group(1)) < 40:  # limit the sessions we take these resolutions from
            #    return False
            tail = re.sub("-", "/", mapv.group(3))
            purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=A/%s/PV.%s%s&Lang=E" % (mapv.group(1), mapv.group(2), tail)
        elif maodoc:
            tail = re.sub("-", "/", maodoc.group(2))
            purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=A/%s%s&Lang=E" % (maodoc.group(1), tail)
            print "oldstyle doc", purl
        elif mscodoc:
            tail = re.sub("-", "/", mscodoc.group(2))
            purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=S/%s%s&Lang=E" % (mscodoc.group(1), tail)
            print "oldstyle doc", purl


        elif mahrc:
            tail = re.sub("-", "/", mahrc.group(2))
            if mahrc.group(1):
                purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=A/HRC/S-%s/%s&Lang=E" % (mahrc.group(1), tail)
            else:
                purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=A/HRC/%s&Lang=E" % tail
            print "human rights council", purl

        elif meres or munknown:
            if IsNotQuiet():
                print "Unknown undocname", undocname
            return False
        else:
            if IsNotQuiet():
                print "Unrecognized undocname", undocname
            return False
    else:
        purl = re.sub("\s", "", purl)
        purl = re.sub("&amp;", "&", purl)

    #print "$$%s$$" % purl
    if IsNotQuiet():
        print " scraping", undocname,
    if not purl:
        print "*** Need to make"
        return False

    ##return False

    # first go through the forwarding blocker
    purl = urlparse.urljoin(plenaryurl, purl)

    try:
        if IsNotQuiet():
            print purl
        plenarypdf = GetFromNet(undocname, purl, plenaryurl)
        if not plenarypdf:
            purlsupp = re.sub("&Lang=E", "(SUPP)&Lang=E", purl)
            if purlsupp != purl:
                plenarypdf = GetFromNet(undocname, purlsupp, plenaryurl)
                #http://daccess-ods.un.org/access.nsf/Get?Open&DS=A/61/5/Add.1(SUPP)&Lang=E

    except KeyboardInterrupt, e:
        print "\n *** Keyboard Interrupt"
        sys.exit(1)
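
A sketch of the code-to-URL mapping for two of the simpler document families handled above (assembly verbatims and Security Council resolutions); the document codes are invented examples and nothing is fetched.

import re

for undocname in ["A-57-PV.43", "S-RES-1441(2002)"]:
    mapv = re.match(r"A-(\d\d)-PV.(\d+)(-Corr.\d|)$", undocname)
    msres = re.match(r"S-RES-(\d+)\((\d+)\)$", undocname)
    if mapv:
        tail = re.sub("-", "/", mapv.group(3))
        purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=A/%s/PV.%s%s&Lang=E" % (
            mapv.group(1), mapv.group(2), tail)
    elif msres:
        sarea = "RESOLUTION" if int(msres.group(1)) <= 766 else "UNDOC"
        purl = "http://daccess-ods.un.org/access.nsf/Get?Open&DS=S/RES/%s%%20(%s)&Lang=E&Area=%s" % (
            msres.group(1), msres.group(2), sarea)
    print("%s -> %s" % (undocname, purl))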
Example No. 20
    def ExtractSeccounFrontPage(self, txlines):
        self.date = None
        self.chairs = []
        self.seccouncilmembers = []
        self.agenda = []

        lasttop = -1
        jtxlines = []
        ih = 0
        while ih < len(txlines):
            if txlines[ih].top == lasttop:
                jtxlines[-1] = "%s %s" % (jtxlines[-1], txlines[ih].ltext)
            else:
                jtxlines.append(txlines[ih].ltext)
                lasttop = txlines[ih].top
            ih += 1

        del txlines  # just deletes the reference to this object
        ih = 0
        while ih < len(jtxlines):
            self.ExtractDateTime(None, jtxlines[ih])
            mpresseat = re.match(
                "<i>(President|Chairman|later)(?:</i>:|:\s*</i>)\s*((?:Mr.|Mrs.|Ms.|Sir\.?|Miss|Sheikh|Baroness|Lord|Nana) .*?)\s+\.(?: \.)*\s*(\(.*)?$",
                jtxlines[ih])
            #print jtxlines[ih], mpresseat
            if mpresseat:
                if not self.date:
                    if IsNotQuiet():
                        for i in range(ih):
                            print jtxlines[i]
                    raise unexception(
                        "missingg date",
                        paranumC(self.undocname, None, 0, -1,
                                 self.textcountnumber))
                if mpresseat.group(1) in ["President", "Chairman"]:
                    assert len(self.chairs) == 0  # first one
                else:
                    assert len(self.chairs) == 1  # later president
                ih += 1
                if mpresseat.group(3):
                    scountry = mpresseat.group(3)
                else:
                    scountry = ""
                if re.search("\(", scountry) and not re.search("\)", scountry):
                    scountry = "%s %s" % (scountry, jtxlines[ih])
                    ih += 1
                mcountry = re.match("\((.*?)\)$", scountry)
                lfscountry = re.sub("\s+", " ", mcountry.group(1))
                fscountry = FixNationName(lfscountry, self.date)
                if not fscountry:
                    if IsNotQuiet():
                        print "--%s--" % mcountry.group(1)
                    raise unexception(
                        "unrecognized nationA",
                        paranumC(self.undocname, None, 0, -1,
                                 self.textcountnumber))
                chairname = re.sub("\s\s+", " ", mpresseat.group(2)).strip()
                self.chairs.append((chairname, fscountry, "president"))

                if fscountry in self.seccouncilmembers:
                    assert len(self.seccouncilmembers) == 1
                    assert fscountry == "New Zealand"
                    assert self.undocname == "S-PV-3370"
                    assert len(self.chairs) == 2
                    del self.chairs[0]
                    del self.seccouncilmembers[0]

                self.seccouncilmembers.append(fscountry)
                continue

            mcountryseat = re.match(
                "(<i>Members(?:</i>:|:\s*</i>))?\s*([\w\-\s]*?)\s*\.(?: \.)*\s*((?:Mr.|Ms.|Mrs.|Miss|Dr.|Sir\.?|Sheikh|Baroness|Lord|Nana) [^<>]*|absent)$",
                jtxlines[ih])
            if mcountryseat:
                if mcountryseat.group(1):
                    if len(self.chairs) not in [
                            1, 2
                    ]:  # in case of second president
                        if IsNotQuiet():
                            print self.chairs, "chchchch"
                        raise unexception(
                            "chairs not thereB",
                            paranumC(self.undocname, None, 0, -1,
                                     self.textcountnumber))
                else:
                    if len(self.chairs) == 0:
                        if not self.date:  # prob a closed meeting
                            break
                        if IsNotQuiet():
                            print ih, jtxlines[ih]
                        raise unexception(
                            "seat without chair",
                            paranumC(self.undocname, None, 0, -1,
                                     self.textcountnumber))
                lfscountry = re.sub("\s+", " ", mcountryseat.group(2))
                fscountry = FixNationName(lfscountry, self.date)
                if not fscountry:
                    if IsNotQuiet():
                        print "--%s--" % mcountryseat.group(2)
                    raise unexception(
                        "unrecognized nationB",
                        paranumC(self.undocname, None, 0, -1,
                                 self.textcountnumber))
                chairname = re.sub("\s\s+", " ", mcountryseat.group(3)).strip()
                self.chairs.append((chairname, fscountry, "member"))
                if fscountry not in self.seccouncilmembers:
                    self.seccouncilmembers.append(fscountry)
                else:
                    if IsNotQuiet():
                        print "Repeat-country on council", fscountry
            else:
                if re.search(" \. \. \. \. \. \. ", jtxlines[ih]):
                    if IsNotQuiet():
                        print "--%s--" % jtxlines[ih]
                    raise unexception(
                        "missing country",
                        paranumC(self.undocname, None, 0, -1,
                                 self.textcountnumber))
            if re.match("<b>Agenda\s*</b>$", jtxlines[ih]):
                ih += 1
                break
            if re.search("Agenda", jtxlines[ih]):
                print ih, jtxlines
                raise unexception(
                    "unextracted Agenda (should be <b>?)",
                    paranumC(self.undocname, None, 0, -1,
                             self.textcountnumber))
            ih += 1

        # could be a closed meeting
        if not self.date:
            alltext = " ".join(jtxlines)
            if re.search(
                    "OFFICIAL COMMUNIQU..*?Held in private (?:in the Security Council Chamber )?at Headquarters(?i)",
                    alltext):
                return False
            return True

        while ih < len(jtxlines):
            if re.match("\d\d-\d\d", jtxlines[ih]):
                break
            if re.match("\d\d.?\d\d\d\d\d \(E\)", jtxlines[ih]):
                break
            if re.match(
                    "This record contains the text of speeches delivered in English",
                    jtxlines[ih]):
                break
            #print "agagag", jtxlines[ih]
            assert not re.search("text of speeches|verbatim(?i)", jtxlines[ih])
            self.agenda.append(jtxlines[ih].strip())
            ih += 1

        #print "ccccc", self.chairs
        lparanum = paranumC(self.undocname, None, 0, -1, self.textcountnumber)
        if len(self.chairs) not in (15,
                                    17) or len(self.seccouncilmembers) != 15:
            if self.undocname == "S-PV-3446":
                return False
            if IsNotQuiet():
                print len(self.seccouncilmembers), len(
                    self.chairs
                ), "wrong number of members or chairs\n", self.chairs
                print self.seccouncilmembers
            raise unexception("wrongnumber on council", lparanum)

        self.agenda = " ".join(self.agenda)
        self.agenda = re.sub("</?b>", " ", self.agenda)
        self.agenda = re.sub("\s\s+", " ", self.agenda)
        self.agenda = MarkupLinks(
            CleanupTags(self.agenda, "council-agenda", lparanum),
            self.undocname, lparanum)
        return True
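
A sketch of the president and member dotted-line patterns used above, on two invented front-page lines; the names and countries are illustrative only.

import re

presline = ("<i>President:</i> Mr. Greenstock . . . . . . . . "
            "(United Kingdom of Great Britain and Northern Ireland)")
membline = "France . . . . . . . . Mr. Levitte"
mpresseat = re.match(
    r"<i>(President|Chairman|later)(?:</i>:|:\s*</i>)\s*((?:Mr.|Mrs.|Ms.|Sir\.?|Miss|Sheikh|Baroness|Lord|Nana) .*?)\s+\.(?: \.)*\s*(\(.*)?$",
    presline)
mcountryseat = re.match(
    r"(<i>Members(?:</i>:|:\s*</i>))?\s*([\w\-\s]*?)\s*\.(?: \.)*\s*((?:Mr.|Ms.|Mrs.|Miss|Dr.|Sir\.?|Sheikh|Baroness|Lord|Nana) [^<>]*|absent)$",
    membline)
print("%s -- %s" % (mpresseat.group(2), mpresseat.group(3)))    # Mr. Greenstock -- (United Kingdom ...)
print("%s -- %s" % (mcountryseat.group(2), mcountryseat.group(3)))  # France -- Mr. Levitte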
Example No. 21
    def __init__(self, xpage, lundocname, lpageno, textcountnumber):
        self.pageno = lpageno
        self.undocname = lundocname
        self.textcountnumber = textcountnumber
        self.bSecurityCouncil = re.match("S-PV.(\d+)", self.undocname)
        self.nSecurityCouncilSession = self.bSecurityCouncil and int(
            self.bSecurityCouncil.group(1)) or 0
        self.bGeneralAssembly = re.match("A-\d+-PV", self.undocname)
        assert self.bSecurityCouncil or self.bGeneralAssembly

        # for right column, if not left justified, this adds a bit more to the right
        if self.bGeneralAssembly and int(
                re.match("A-(\d+)", lundocname).group(1)) <= 52:
            rightcolstartindentincrement = 1
        else:
            rightcolstartindentincrement = 0

        # set the column starts from some of the special cases we get
        leftcolstart = 90
        if self.bGeneralAssembly and int(
                re.match("A-(\d+)", lundocname).group(1)) <= 54:
            rightcolstart = 481
        else:
            rightcolstart = 468

        if lundocname in [
                "A-54-PV.100", "A-54-PV.96", "A-54-PV.98", "A-54-PV.99",
                "S-PV-4143", "S-PV-4143-Resu.1"
        ]:
            rightcolstart = 468
        elif lundocname in ["A-54-PV.97"]:
            rightcolstart = 486
        elif re.match("S-PV-335[0-8]", lundocname):
            rightcolstart = 468
        elif re.match("S-PV-334", lundocname):
            rightcolstart = 468
        elif self.nSecurityCouncilSession >= 4144:
            rightcolstart = 468

        #re.match("S-PV-414[4-9]", lundocname):
        #    rightcolstart = 468
        #elif re.match("S-PV-41[5-9]", lundocname):
        #    rightcolstart = 468
        #elif re.match("S-PV-4[2-9]", lundocname):
        #    rightcolstart = 468
        #elif re.match("S-PV-5", lundocname):
        #    rightcolstart = 468

        elif self.bSecurityCouncil:
            rightcolstart = 481
            rightcolstartindentincrement = 1

        # generate the list of lines, sorted by vertical position
        ftxlines = re.findall("<text.*?</text>", xpage)

        txlines = []
        for txline in ftxlines:
            txl = TextLine(txline, lundocname, lpageno, self.textcountnumber)
            self.textcountnumber += 1
            if txl.ltext:
                if txlines and txlines[-1].bfootertype and txlines[
                        -1].top == txl.top:
                    txl.bfootertype = True
                txlines.append(txl)
        txlines.sort(key=TextLineTopKey)

        # the half divider is at 459

        # try to separate out the header and footers
        if self.pageno == 1 and self.bGeneralAssembly:
            ih = self.ExtractDotLineChairHead(txlines)
            #for Dtxl in txlines[-10:]:
            #    print Dtxl.top, Dtxl.left, Dtxl.ltext

            ie = len(txlines) - 1
            while txlines[ie].bfootertype:
                #print "FOOTER:", txlines[ie].ltext
                ie -= 1
            #print "**NON-FOOTER:", txlines[ie].ltext
            ie += 1

            # the whole first page gets parsed separately
            assert not self.bSecurityCouncil

        elif self.bSecurityCouncil and self.pageno == 1:
            if not self.ExtractSeccounFrontPage(txlines):
                self.bSecurityCouncil = "ClosedSession"
            return

        # special case where the agenda spills to a second page (don't forget the outer application of this if)
        elif self.bSecurityCouncil and lundocname in twopageagendas and self.pageno == 2:
            ih = 0
            self.agenda = []
            while ih < len(txlines):
                if 132 <= txlines[ih].top < 1000:
                    self.agenda.append(txlines[ih].ltext)
                ih += 1
            self.agenda = " ".join(self.agenda)
            self.agenda = re.sub("</?b>", " ", self.agenda)
            self.agenda = re.sub("\s\s+", " ", self.agenda)
            lparanum = paranumC(self.undocname, None, 0, -1,
                                self.textcountnumber)
            self.agenda = MarkupLinks(
                CleanupTags(self.agenda, "council-agenda", lparanum),
                self.undocname, lparanum)
            return

        elif self.bGeneralAssembly:
            if re.match("<b>\w[/.]\d+/PV.\d+\s*</b>", txlines[0].ltext):
                ih = 1
            elif re.match("\d", txlines[0].ltext) and re.match(
                    "<b>\w[/.]\d+/PV.\d+\s*</b>", txlines[1].ltext):
                ih = 2
            else:
                #print txlines[0].ltext
                assert re.match("General Assembly",
                                txlines[0].ltext), txlines[0].ltext
                assert re.match("\d+(?:th|st|nd|rd) (?:plenary )?meeting",
                                txlines[1].ltext)
                assert re.match("\S+ [Ss]ession", txlines[2].ltext)
                assert re.match("\d+ \w+ \d\d\d\d", txlines[3].ltext) or (
                    lundocname in ["A-50-PV.38", "A-50-PV.40"])
                ih = 4
            ie = len(txlines) - 1
            if re.match("\d\d\-\d\d\d\d\d", txlines[ie].ltext):
                ie -= 1
            pagenumtext = re.sub("<..?>", "", txlines[ie].ltext).strip()
            if re.match("\d\d\-\d\d\d\d\d", txlines[ie - 1].ltext):
                ie -= 1
            if not re.match("\d+$", pagenumtext):
                if IsNotQuiet():
                    print "jjjj", pagenumtext, txlines[ie].ltext
                raise unexception(
                    "pagenum error not a number",
                    paranumC(self.undocname, None, 0, -1,
                             txlines[ie].textcountnumber))
            if int(pagenumtext) != self.pageno:
                if IsNotQuiet():
                    print pagenumtext, self.pageno
                raise unexception(
                    "pagenum serror of speaker-intro",
                    paranumC(self.undocname, None, 0, -1,
                             txlines[ie].textcountnumber))

        elif self.bSecurityCouncil:
            #if len(txlines) < 4:
            #    raise unexception("intro too short", paranumC(self.undocname, None, 0, -1, txlines[0].textcountnumber))

            bl0 = len(txlines) > 4 and re.match("Security Council",
                                                txlines[0].ltext)
            bl1 = len(txlines) > 4 and re.match(
                "\d+(?:th|st|nd|rd)? (?:\(Resumption(?: \d)?\) )?(?:meeting)?",
                txlines[1].ltext)
            bl2 = len(txlines) > 4 and re.match("(\w+-\w+|\w+) [Yy]ear",
                                                txlines[2].ltext)
            bl3 = len(txlines) > 4 and re.match("\d+ \w+ \d\d\d\d",
                                                txlines[3].ltext)

            bl4 = re.match(
                "<b>S/PV.\d+\s*(?:\(Resumption [\d|I]\)|\(Part [I]+\))?\s*</b>",
                txlines[0].ltext)
            bl4r = (self.undocname[5:] >= "4143")

            if bl4 and bl4r:
                ih = 1
            elif bl0 and bl1 and bl2 and bl3:
                ih = 4
            else:
                if IsNotQuiet():
                    print "\nFirst four lines on page:", self.pageno, bl4, bl4r
                    print bl0, txlines[0].ltext
                    print bl1, txlines[1].ltext
                    print bl2, txlines[2].ltext
                    print bl3, txlines[3].ltext
                    print bl4, bl4r
                raise unexception(
                    "bad page header",
                    paranumC(self.undocname, None, 0, -1,
                             txlines[0].textcountnumber))

            ie = len(txlines) - 1
            if re.match("\d\d\-\d\d\d\d\d", txlines[ie].ltext):
                ie -= 1
            pagenumtext = txlines[ie].ltext
            mpagenumtext = re.match("(?:<b>)?(\d+)\s*(?:</b>)?$", pagenumtext)
            if not mpagenumtext:
                if IsNotQuiet():
                    print "jkjk", pagenumtext
                raise unexception(
                    "pagenum error not a number",
                    paranumC(self.undocname, None, 0, -1,
                             txlines[ie].textcountnumber))
            pgoffset = int(mpagenumtext.group(1)) - self.pageno
            if pgoffset != 0 and self.undocname not in misnumberedpages:
                if IsNotQuiet():
                    print "pagenum-offset not in list", self.undocname, mpagenumtext.group(
                        1), self.pageno
                raise unexception(
                    "page pagenum error of speaker-intro",
                    paranumC(self.undocname, None, 0, -1,
                             txlines[ie].textcountnumber))
            if re.match("\d\d-\d\d\d\d\d$", txlines[ie - 1].ltext):
                ie -= 1

        else:
            assert False

        # separate out the header and footers
        self.txlheader = txlines[:ih]
        self.txlfooter = txlines[ie:]

        # separate the body into the two columns
        self.txlcol1 = []
        self.txlcol2 = []
        self.minindentleft = 9999
        self.minindentright = 9999
        for txl in txlines[ih:ie]:
            if txl.left < 459:
                #print txl.bfootertype, txl.left, txl.width, txl.top, txl.ltext  # zzzz
                # there's a bit of spilling out where the region is larger than it should be for the words as in A-56-PV.64
                if not (txl.left + txl.width <= 459):
                    if txl.left + txl.width > 501:
                        if IsNotQuiet():
                            print txl.left, txl.width, txl.left + txl.width
                            print txl.ltext
                            print "might have page no. 1 on first page (or add to twopageagendas)"
                        raise unexception(
                            "right-hand extension excessive",
                            paranumC(txl.undocname, None, 0, -1,
                                     txl.textcountnumber))
                    if not (txl.left <= 165):
                        bc = -1
                        while True:
                            assert self.txlcol1[-1].txls[
                                bc].top == txl.top  # in-line but shorter
                            if (self.txlcol1[-1].txls[bc].left <= 165):
                                break
                            bc -= 1

                txl.indent = txl.left - leftcolstart
                if txl.indent < 0:
                    if IsNotQuiet():
                        print txl.indent, txl.ltext
                    raise unexception(
                        "negative indentation",
                        paranumC(txl.undocname, None, 0, -1,
                                 txl.textcountnumber))
                self.minindentleft = min(txl.indent, self.minindentleft)
                txl.brightcol = False
                AppendToCluster(self.txlcol1, txl)

            else:
                txl.indent = txl.left - rightcolstart
                if txl.indent != 0:
                    txl.indent += rightcolstartindentincrement
                if txl.indent < 0:
                    if IsNotQuiet():
                        print txl.indent, txl.left, rightcolstart
                        print txl.ltext
                    raise unexception(
                        "negative indent on righthand column",
                        paranumC(self.undocname, None, 0, -1,
                                 self.textcountnumber))
                self.minindentright = min(txl.indent, self.minindentright)
                txl.brightcol = True
                AppendToCluster(self.txlcol2, txl)
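
A sketch of the two-column assignment around the x=459 divider used in the loop above; leftcolstart, rightcolstart and the (left, text) pairs are invented stand-ins for real TextLine objects.

leftcolstart, rightcolstart = 90, 468
for left, ltext in [(90, "left column, no indent"),
                    (121, "left column, indented paragraph"),
                    (468, "right column, no indent")]:
    if left < 459:
        print("col1 indent=%d %s" % (left - leftcolstart, ltext))
    else:
        print("col2 indent=%d %s" % (left - rightcolstart, ltext))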
Example No. 22
def ScrapeSCContentsPage(year, contentsurl):
    if IsNotQuiet():
        print "URL index:", contentsurl
    fin = urllib2.urlopen(contentsurl)
    scindex = fin.read()
    fin.close()

    reslist = [ ]
    pvlist = [ ]
    prstlist = [ ]
    scdoclist = [ ]
    pvcorrlist = [ ]

    # this gets everything except the press releases in the middle column
    scindexlist = re.findall('<a.[^>]*?href=\s*"(http://daccess[^"]*)"[^>]*>\s*(?:<font size="2">)?(.*?)(?:<br>\s*)?</a>(?is)', scindex)

    for sci in scindexlist:
        #print sci[1]
        # communique for an embargoed verbatim recording
        if re.match("Communiqu.", sci[1]):
            pvlm12 = re.sub("&amp;", "&", pvlist[-1][2])
            if sci[0] != pvlm12 and sci[0] != pvlist[-1][2] and not re.search("PV.5794|PV.5906", sci[0]):
                print "Communique doesn't have same link as it should:\n%s\n%s" % (sci[0], pvlist[-1][2])  # same link
            continue

        # security council resolutions
        scres = re.match("S/RES/(\d+)\s*\((\d+)\)\s*$", sci[1])
        if scres:
            reslist.append((-int(scres.group(1)), scres, sci[0]))
            continue

        # verbatim recordings
        scpv = re.match("S/PV\.(\d+)(?:\s*<br>)?(?:\s*\((Resumption|Part)\s*([\dI]*)\))?\s*(?:\(closed\))?$", sci[1])
        if scpv:
            pvlist.append((-int(scpv.group(1) or "1"), scpv, sci[0]))
            #print scpv.group(0)
            continue

        # corrigenda, which apply to the verbatim transcripts
        sccorr = re.match("Corr\.(\d+)\s*", sci[1])
        if sccorr:
            urlwithoutcorr = re.sub("/Corr\.\d+(?i)", "", sci[0])
            if pvlist[-1][2] != urlwithoutcorr and IsNotQuiet():
                print pvlist[-1][2]
                print urlwithoutcorr
                if year == 1998 and re.search("PV\.3896", pvlist[-1][2]) and re.search("PV\.3986", urlwithoutcorr):
                    print "  --- known typo"
                elif year == 1995 and re.search("PV\.3528", pvlist[-1][2]) and re.search("PV\.3611", urlwithoutcorr):
                    print "  --- known typo"
                elif year == 2008 and re.search("PV\.5916", pvlist[-1][2]) and re.search("PV\.5916", urlwithoutcorr):
                    print "  --- known unconsolidated typo"
                else:
                    print year, pvlist[-1], sci
                    assert False
            pvcorrlist.append((pvlist[-1][1], sccorr.group(1), sci[0]))
            continue

        # presidential statements
        scprst = re.match("S/PRST/(\d+)/(\d+)\s*$", sci[1])
        if scprst:
            assert int(scprst.group(1)) == year
            prstlist.append((-int(scprst.group(2)), scprst, sci[0]))
            continue

        # security council documents (usually a failed resolution)
        scdoc = re.match("\(?S/(\d+)/(\d+)\)?\s*$", sci[1])
        if scdoc:
            assert int(scdoc.group(1)) == year
            scdoclist.append((-int(scdoc.group(2)), scdoc, sci[0]))
            continue

        # known typo link
        if re.match("<a>", sci[1]):
            assert sci[0] == pvlist[-1][2]  # same link
            continue

        if IsNotQuiet():
            print "Unrecognized link type", "$$%s$$" % sci[1]
        assert False

    # sort and scrape all the presidential statements
    prstlist.sort()
    for i in range(1, len(prstlist)):
        if -prstlist[i - 1][0] - 1 != -prstlist[i][0] and IsNotQuiet():
            print "presidential statement missing between ", -prstlist[i - 1][0], "and", -prstlist[i][0]
            if (year, -prstlist[i - 1][0],  -prstlist[i][0]) in [(2000, 28, 26), (1996, 11, 9), (1996, 4, 2), (1995, 57, 55), (1995, 37, 35), (1995, 15, 13), (1995, 4, 2), (1994, 77, 75), (1994, 50, 48), (1994, 42, 39), (1994, 25, 23), (1994, 19, 17), (1994, 4, 2)]:
                print "  -- known missing statement"
            else:
                assert False
    for (i, prst, prsturl) in prstlist:
        ScrapePDF("S-PRST-%s-%s" % (prst.group(1), prst.group(2)), plenaryurl=contentsurl, purl=prsturl)

    # now sort and scrape all the verbatims
    pvlist.sort()
    for i in range(1, len(pvlist)):
        #print pvlist[i - 1][1].group(2), pvlist[i - 1][1].group(3)
        if -pvlist[i - 1][0] == -pvlist[i][0]:
            if pvlist[i - 1][1].group(2) == "Resumption":
                resum = int(pvlist[i][1].group(3) or "0")
                if not pvlist[i - 1][1].group(2):
                    if IsNotQuiet():
                        print "rrr", pvlist[i - 1][1].group(0) # there must be a resumption number
                if not pvlist[i - 1][1].group(3):
                    resumP = 1
                elif pvlist[i - 1][1].group(3) == "I":
                    resumP = 1
                else:
                    resumP = int(pvlist[i - 1][1].group(3))
                assert resumP == resum + 1
            else:
                if IsNotQuiet():
                    print "slslsl", pvlist[i - 1][1].group(2), pvlist[i][1].group(2)
        elif -pvlist[i - 1][0] - 1 != -pvlist[i][0]:
            if IsNotQuiet():
                print "verbatim report missing between ", -pvlist[i - 1][0], "and", -pvlist[i][0]
                assert False
    for (i, scpv, scpvurl) in pvlist:
        resumppart = ""
        if scpv.group(2) == "Resumption":
            if scpv.group(3) == "I":
                resnum = 1
            else:
                resnum = int(scpv.group(3))
            resumppart = "-Resu.%d" % resnum
        elif scpv.group(2) == "Part":
            if scpv.group(3) == "I":
                pn = "1"
            elif scpv.group(3) == "II":
                pn = "2"
            else:
                if IsNotQuiet():
                    print "asspv", scpv.group(0), scpv.group(3)
                assert False, "unrecognized Part number: %s" % scpv.group(3)
            resumppart = "-Part.%s" % pn
        ScrapePDF("S-PV-%s%s" % (scpv.group(1), resumppart), plenaryurl=contentsurl, purl=scpvurl)

    # process the corrigenda
    for (scpv, pvcorr, pvcorrurl) in pvcorrlist:
        ScrapePDF("S-PV-%s%s-Corr.%s" % (scpv.group(1), (scpv.group(2) and ("-Resu.%s" % scpv.group(3)) or ""), pvcorr), plenaryurl=contentsurl, purl=pvcorrurl)

    # now sort and scrape all the resolutions
    reslist.sort()
    for i in range(1, len(reslist)):
        if -reslist[i - 1][0] - 1 != -reslist[i][0]:
            if IsNotQuiet():
                print "resolution missing between ", -reslist[i - 1][0], "and", -reslist[i][0]
                assert False
    for (i, scres, scresurl) in reslist:
        ScrapePDF("S-RES-%s(%s)" % (scres.group(1), scres.group(2)), plenaryurl=contentsurl, purl=scresurl)
Exemplo n.º 23
0
    def __init__(self, xfil, undocname):
        self.sdate = None
        self.chairs = None
        self.agenda = None
        self.tlcall = None
        self.seccouncilmembers = None
        self.bSecurityCouncil = re.match("S-PV.\d+", undocname)
        self.bGeneralAssembly = re.match("A-\d+-PV", undocname)

        xpages = StripPageTags(xfil, undocname)
        if not xpages:
            return  # bitmap type encountered
        txpages = []
        self.tlcall = []

        for i in range(len(xpages)):
            txpage = TextPage(xpages[i], undocname, i + 1, (txpages or 0)
                              and txpages[-1].textcountnumber)
            if i == 0 and txpage.bSecurityCouncil == "ClosedSession":
                if IsNotQuiet():
                    print " -- closedsession"
                self.tlcall = None
                return  # closed session encountered
            txpages.append(txpage)

            if txpage.bSecurityCouncil and i == 0:
                continue

            # special cases of agenda overflowing into two pages
            if txpage.bSecurityCouncil and i == 1 and undocname in twopageagendas:
                txpages[0].agenda = "%s %s" % (
                    txpages[0].agenda, txpage.agenda
                )  # ram it all into one paragraph (who cares)
                continue

            bmissingcolumns = undocname in ["A-61-PV.106", "A-52-PV.39"]
            if txpage.txlcol1:
                AppendCluster(self.tlcall, txpage.txlcol1[0], "newpage")
                for tlc in txpage.txlcol1[1:]:
                    AppendCluster(self.tlcall, tlc, "gapcluster")
            elif not bmissingcolumns:
                #assert i == len(xpages) - 1  # only last page can have missing columns (sometimes it's the first)
                print "page", i, "of", len(xpages)
                #print txpages[-1].textcountnumber
                raise unexception(
                    "missing column not on last page",
                    paranumC(undocname, None, 0, -1,
                             txpages[-1].textcountnumber))

            # have had a case where the first column was the blank one
            if txpage.txlcol2:
                AppendCluster(self.tlcall, txpage.txlcol2[0], "newcolumn")
                for tlc in txpage.txlcol2[1:]:
                    AppendCluster(self.tlcall, tlc, "gapcluster")
            elif not bmissingcolumns:
                assert i == len(xpages) - 1, "%d != %d" % (i, len(xpages) - 1)

        # assign ids to the clusters
        self.sdate = txpages[0].date
        paranumlast = paranumC(undocname, self.sdate, 0, -1, 0)
        for tlc in self.tlcall:
            if tlc.txls[0].pageno == paranumlast.pageno:
                paranumlast = paranumC(undocname, self.sdate,
                                       paranumlast.pageno,
                                       paranumlast.paragraphno + 1,
                                       tlc.txls[0].textcountnumber)
            else:
                paranumlast = paranumC(undocname, self.sdate,
                                       tlc.txls[0].pageno, 1,
                                       tlc.txls[0].textcountnumber)
            tlc.paranum = paranumlast

        # merge the lines together and remove double bold/italics that happen across lines
        for tlc in self.tlcall:
            jparatext = []  # don't insert spaces where there is a hyphen
            for txl in tlc.txls:
                if jparatext and not (re.search("\w[-/]$", jparatext[-1])
                                      and re.match("\w", txl.ltext)):
                    jparatext.append(" ")
                jparatext.append(txl.ltext)
            tlc.paratext = "".join(jparatext)

            tlc.paratext = re.sub("-</i> <i>", "-", tlc.paratext)
            tlc.paratext = re.sub("-</b> <b>", "-", tlc.paratext)
            tlc.paratext = re.sub("</b>\s*\.\s*<b>", ". ", tlc.paratext)
            tlc.paratext = re.sub("Secretary- General", "Secretary-General",
                                  tlc.paratext)
            tlc.paratext = re.sub(
                "\s*(?:</i>\s*<i>|</b>\s*<b>|<b>\s*</b>|<i>\s*</i>|<b>\s*<i>\s*</b>\s*</i>)\s*",
                " ", tlc.paratext)
            tlc.paratext = tlc.paratext.strip()

            tlc.paratext = re.sub(
                "^<b>(The(?: Acting)? Co-Chairperson) \(([^\)]*)\)\s*(?:</b>\s*:|:\s*</b>)",
                "<b>\\1</b> (\\2):", tlc.paratext)
            tlc.lastindent = tlc.indents[-1][0]

        self.agenda = txpages[0].agenda
        self.chairs = txpages[0].chairs
        if self.bSecurityCouncil:
            self.seccouncilmembers = txpages[0].seccouncilmembers
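
# Sketch of the paragraph-id assignment above, with invented page numbers: the
# paragraph counter increments while successive clusters stay on the same page
# and resets to 1 whenever the page number changes.
def number_paragraphs_sketch(pagenos):
    lastpage, lastpara, out = 0, -1, []
    for pageno in pagenos:
        if pageno == lastpage:
            lastpara += 1
        else:
            lastpage, lastpara = pageno, 1
        out.append((lastpage, lastpara))
    return out

if __name__ == "__main__":
    print number_paragraphs_sketch([1, 1, 1, 2, 2, 3])
    # [(1, 1), (1, 2), (1, 3), (2, 1), (2, 2), (3, 1)]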
Exemplo n.º 24
0
def GetFromNet(undocname, purl, plenaryurl):
    req = urllib2.Request(purl)
    req.add_header('Referer', plenaryurl)
    fin = urllib2.urlopen(req)
    plenrefererforward = fin.read()
    fin.close()
    mfore = re.search('URL=([^"]*)', plenrefererforward)
    if not mfore:
        if undocname == "A-55-PV.26":   # claims to be embargoed
            if IsNotQuiet():
                print "broken", pdfname
            return False
        if re.search("There is no document", plenrefererforward):
            #print "no-document"
            return False
        if re.search("This document is under EMBARGO", plenrefererforward):
            if IsNotQuiet():
                print "*** EMBARGOED ***"
            return False
        if re.search("The distribution of the document is to hight", plenrefererforward):
            if IsNotQuiet():
                print "*** TO HIGHT ***"
            return False
        if not IsNotQuiet():  # bail out without error
            return False
        print "plplplpl", plenrefererforward
        assert False
    turl = urlparse.urljoin(purl, mfore.group(1))
    # pull in the login url, containing another forward, and a page which gives the cookies
    fin = urllib2.urlopen(turl)
    plenarycookielink = fin.read()
    fin.close()

    #<META HTTP-EQUIV="refresh" CONTENT="1; URL=http://daccessdds.un.org/doc/UNDOC/GEN/N02/596/08/PDF/N0259608.pdf?OpenElement">
    #<frame name="footer" scrolling="no" noresize target="main" src="http://daccessdds.un.org/prod/ods_mother.nsf?Login&Username=freeods2&Password=1234" marginwidth="0" marginheight="0">

    # extract pdf link
    mpdf = re.search('URL=([^"]*)', plenarycookielink)
    if not mpdf:
        if not IsNotQuiet():  # bail out without error
            return False
        print "pcpcpcpc", plenarycookielink
    plenarypdfurl = urlparse.urljoin(turl, mpdf.group(1))

    # extract cookie link
    mcook = re.search('src="(http://daccessdds.un.org/[^"]*)', plenarycookielink)
    if not mcook:
        if not IsNotQuiet():  # bail out without error
            return False
        print "plplsplspl", plenarycookielink
    plenarycookurl = urlparse.urljoin(turl, mcook.group(1))

    # take the cookies from the cookie link
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    fin = opener.open(plenarycookurl)
    fin.close()

    if IsNotQuiet():
        print plenarypdfurl[-30:]

    # use those cookies to fetch the pdf link
    fin = opener.open(plenarypdfurl)
    plenarypdf = fin.read()
    fin.close()

    return plenarypdf
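
# Hedged usage sketch: one way GetFromNet might be driven by a caller such as
# ScrapePDF.  The calling code is not shown in this file, so the output
# directory and the example call are assumptions, not the project's actual
# behaviour.
import os

def save_pdf_sketch(undocname, purl, plenaryurl, pdfdir="pdf"):
    pdfdata = GetFromNet(undocname, purl, plenaryurl)
    if not pdfdata:
        return False   # missing, embargoed, or otherwise unavailable
    fout = open(os.path.join(pdfdir, "%s.pdf" % undocname), "wb")
    fout.write(pdfdata)
    fout.close()
    return True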
Exemplo n.º 25
0
    def __init__(self, tlcall, i, lundocname, lsdate, seccouncilmembers):
        self.tlcall = tlcall
        self.i = i
        self.sdate = lsdate
        self.undocname = lundocname
        self.bSecurityCouncil = re.match("S-PV.\d+", self.undocname)
        self.bGeneralAssembly = re.match("A-\d+-PV", self.undocname)
        assert self.bGeneralAssembly or self.bSecurityCouncil
        if not self.bSecurityCouncil:
            seccouncilmembers = None

        self.pageno, self.paranum = tlcall[i].txls[0].pageno, tlcall[i].paranum

        vtext = re.sub("</?i>", "", tlcall[self.i].paratext).strip()
        if self.bGeneralAssembly and re.match(
                "A recorded vote has been requested(?: for this item| on (?:the|this) motion|\. We shall now begin the voting process)?\.?$",
                vtext):
            self.i += 1
            vtext = re.sub("</?i>", "", tlcall[self.i].paratext).strip()
        if self.bGeneralAssembly and re.match(
                "A recorded vote was taken\s*\.?$", vtext):
            self.i += 1
        if self.bSecurityCouncil and re.match(
                "A vote was taken(?: by (?:a )?show of hands)?.$", vtext):
            self.i += 1

        if not (self.i != i or self.undocname
                in ["A-55-PV.86", "A-50-PV.90", "A-49-PV.90"]):
            print "--%s--" % tlcall[self.i - 1].paratext
            if not re.match("<i>", tlcall[self.i - 1].paratext):
                print "  --[should this line be italic?]"
            print tlcall[self.i].paratext
            raise unexception("requested vote not followed through",
                              tlcall[self.i].paranum)

        self.vlfavour = self.DetectVote("<i>In favour:?\s*</i>:?")
        self.vlagainst = self.DetectVote("(?:<i>)?Against:?\s*(?:</i>)?:?")
        self.vlabstain = self.DetectVote("(?:<i>)?Abstaining:?(?:</i>)?:?")
        gnv, self.vlabsent = GenerateNationsVoteList(self.vlfavour,
                                                     self.vlagainst,
                                                     self.vlabstain,
                                                     self.sdate, self.paranum,
                                                     seccouncilmembers)
        self.votecount = "favour=%d against=%d abstain=%d absent=%d" % (len(
            self.vlfavour), len(self.vlagainst), len(
                self.vlabstain), len(self.vlabsent))
        if IsNotQuiet():
            print "  ", self.votecount
        if self.bGeneralAssembly:
            self.DetectAdoption()
            self.DetectSubsequentVoteChange(gnv)
        if self.bSecurityCouncil:
            self.motiontext = ""
            self.DetectDidnotparticipate(gnv, self.vlabsent)

        #res = [ '\t\t<div style="border:1px solid black; margin-left:2em"><b>VOTE ', votecount, "</b><br>\n", "\t\t<i>", self.motiontext, "</i>\n" ]
        #res.append('\t\t<div style="font-size:6">')
        lvotelist = []
        for nation, vote in sorted(gnv.items()):
            lvotelist.append('<span class="%s">%s</span>' % (vote, nation))
        self.votelist = ", ".join(lvotelist)
        #res.append("</div></div>\n")
        #self.parafout = "".join(res)
        self.typ = "vote"
Exemplo n.º 26
0
def process_file(pfnameunindexed, xapian_db):
    mdocid = re.match(r".*?(html[\\/])([\-\d\w\.]+?)(\.unindexed)?(\.html)$", pfnameunindexed)
    assert mdocid, "unable to match: %s" % pfnameunindexed
    document_id = mdocid.group(2)

    fin = open(pfnameunindexed)
    doccontent = fin.read()
    fin.close()

    mdocument_date = re.search('<span class="date">(\d\d\d\d-\d\d-\d\d)</span>', doccontent)
    assert mdocument_date, "not found date in file %s" % pfnameunindexed
    document_date = mdocument_date.group(1)

    if IsNotQuiet():
        print "indexing %s %s" % (document_id, document_date)

    while delete_all_for_doc(document_id, xapian_db):
        pass   # keep calling delete until all clear

    # Loop through each speech, and batch up the headings so they can be updated with the correct info
    xapian_doc_heading = None
    sdiv_headingdata = "NOHEADINGSET"  # kills off the assertion that happens later.  we can get a "meeting began" before the first heading
    xapian_doc_subheading = None
    sdiv_subheadingdata = None

    headingtermsubheading = set()
    headingtermheading = set()

    lastend = 0

    tdocument_id, gasssess = thinned_docid(document_id)
    docterms = set()
    docterms.add("D%s" % document_id)
    docterms.add("E%s" % document_date[:4])  # year
    docterms.add("E%s" % document_date[:7])  # year+month
    docterms.add("E%s" % document_date)      # full date
    #if document_date > "2001-09-11":
    #    docterms.add("Epost911")      # "9/11 changed everything"

    if gasssess:
        docterms.add("Zga")
        docterms.add("Zga%s" % gasssess)
    else:
        docterms.add("Zsc")

    mdivs = re.finditer('^<div class="([^"]*)" id="([^"]*)"(?: agendanum="([^"]*)")?[^>]*>(.*?)^</div>', doccontent, re.S + re.M)
    for mdiv in mdivs:
        # div_data records where this div sits in the file, so the original string can be dereferenced later
        div_class = mdiv.group(1)
        div_data = (document_id, mdiv.start(), mdiv.end() - mdiv.start(), mdiv.group(2))

        xapian_doc = MakeBaseXapianDoc(mdiv, tdocument_id, document_date, headingtermsubheading)
        for dterm in docterms:
            xapian_doc.add_term(dterm)

        if div_class == "heading":
            assert not xapian_doc_heading, "Only one heading per document"
            xapian_doc_heading = xapian_doc
            sdiv_headingdata = div_data

        # the data put into a xapian object is: speech | document-id | offset | length | heading-id containing this speech | length of full section if this is a heading
        elif div_class in ["subheading", "end-document"]:
            assert xapian_doc_heading
            if xapian_doc_subheading:
                for hterm in headingtermsubheading:
                    xapian_doc_subheading.add_term(hterm)
                dsubheadingdata = "%s|%s|%d|%d|%s|%d" % (sdiv_subheadingdata[3], sdiv_subheadingdata[0], sdiv_subheadingdata[1], sdiv_subheadingdata[2], sdiv_headingdata[3], lastend - sdiv_subheadingdata[1])
                xapian_doc_subheading.set_data(dsubheadingdata)
                xapian_db.add_document(xapian_doc_subheading)

            headingtermheading.update(headingtermsubheading)
            if div_class == "subheading":
                headingtermsubheading.clear()
                xapian_doc_subheading = xapian_doc
                sdiv_subheadingdata = div_data
            else:
                headingtermsubheading = None
                xapian_doc_subheading = None
                sdiv_subheadingdata = None

            if div_class == "end-document":
                for hterm in headingtermheading:
                    xapian_doc_heading.add_term(hterm)
                dheadingdata = "%s|%s|%d|%d|%s|%d" % (sdiv_headingdata[3], sdiv_headingdata[0], sdiv_headingdata[1], sdiv_headingdata[2], "", lastend - sdiv_headingdata[1])
                xapian_doc_heading.set_data(dheadingdata)
                xapian_db.add_document(xapian_doc_heading)
                xapian_doc_heading = None

        else:
            assert div_class in ["assembly-chairs", "council-agenda", "council-attendees", "spoken", "italicline", "italicline-tookchair", "italicline-spokein", "recvote", "boldline"], "unknown divclass:%s" % div_class
            assert sdiv_subheadingdata or sdiv_headingdata
            ddata = "%s|%s|%d|%d|%s|" % (div_data[3], div_data[0], div_data[1], div_data[2], (sdiv_subheadingdata or sdiv_headingdata)[3])
            xapian_doc.set_data(ddata)
            xapian_db.add_document(xapian_doc)

        lastend = mdiv.end()

    # the end-document tag helps us close these headings off
    assert not xapian_doc_subheading and not xapian_doc_heading

    # Note that the document has been indexed
    xapian_db.flush()

    if mdocid.group(3): # unindexed
        pfnameindexed = re.sub(r"\.unindexed", "", pfnameunindexed)
        if os.path.exists(pfnameindexed):
            os.unlink(pfnameindexed)
        #print pfnameunindexed, pfnameindexed
        os.rename(pfnameunindexed, pfnameindexed)
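
# Sketch of reading back the pipe-separated data string written above, in the
# order described in the comment: div id | document id | offset | length |
# containing heading id | section length (headings and subheadings only).
# The sample value is invented.
def unpack_xapian_data_sketch(data):
    divid, docid, offset, length, headingid, sectionlength = data.split("|")
    return {"divid": divid, "docid": docid,
            "offset": int(offset), "length": int(length),
            "headingid": headingid,
            "sectionlength": sectionlength and int(sectionlength) or None}

if __name__ == "__main__":
    print unpack_xapian_data_sketch("pg002-bk01|A-62-PV.74|1532|910|subheading-1|")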
Exemplo n.º 27
0
    def __init__(self, tlcall, i, lundocname, lsdate, speakerbeforetookchair,
                 prevagendanum):
        self.tlcall = tlcall
        self.i = i
        self.sdate = lsdate
        self.undocname = lundocname
        self.bSecurityCouncil = re.match("S-PV.\d+", self.undocname)
        if not self.bSecurityCouncil:
            self.genasssess = re.match("A-(\d+)", self.undocname).group(1)
        self.agendanum = ""

        self.pageno, self.paranum = tlcall[i].txls[0].pageno, tlcall[i].paranum
        # paranum = ( undocname, sdate, tlc.txls[0].pageno, paranumber )
        #self.gid = self.paranum.MakeGid()

        tlc = self.tlcall[self.i]
        #print "\npppp", tlc.indents, tlc.paratext, tlc.txls
        ptext, self.typ, self.speaker = DetectSpeaker(tlc.paratext,
                                                      tlc.indents,
                                                      self.paranum,
                                                      speakerbeforetookchair)
        ptext = MarkupLinks(CleanupTags(ptext, self.typ, self.paranum),
                            self.undocname, self.paranum)
        self.i += 1
        if self.typ in [
                "italicline", "italicline-tookchair", "italicline-spokein"
        ]:
            self.paragraphs = [("italicline", ptext)]
            return

        # series of boldlines
        if self.typ == "boldline":
            self.agendanum = ""
            blinepara = tlc.lastindent and "blockquote" or "p"

            # detect the agenda
            if not self.bSecurityCouncil:
                self.agendanum = DetectAgendaForm(ptext, self.genasssess,
                                                  prevagendanum, self.paranum)
                #print "aaaaa  ", self.agendanum
                if not self.agendanum:
                    if IsNotQuiet():
                        print "if no agenda, add to AgendaTypeMap"
                    raise unexception(" uncategorized agenda title",
                                      self.paranum)

            self.paragraphs = [(blinepara, ptext)]
            while self.i < len(self.tlcall):
                tlc = self.tlcall[self.i]
                if not re.match(reboldline, tlc.paratext):
                    break
                ptext = MarkupLinks(
                    CleanupTags(tlc.paratext, self.typ, self.paranum),
                    self.undocname, self.paranum)

                # a second agenda number gets found
                if not self.bSecurityCouncil and re.match(
                        "Agenda(?: item)? \d+(?i)", ptext):
                    agendanum2 = DetectAgendaForm(ptext, self.genasssess,
                                                  prevagendanum, self.paranum)
                    print "agendanum from second line", agendanum2
                    assert agendanum2, ptext  # must detect it
                    if re.search("misc|show|address", self.agendanum):
                        self.agendanum = agendanum2  # a woolly agenda can be over-ridden
                    elif self.undocname == "A-62-PV.74":
                        self.agendanum = "%s,%s" % (self.agendanum, agendanum2)
                    else:
                        print self.agendanum
                        print ptext
                        raise unexception(" unknown extra agendanum case",
                                          self.paranum)
                    print "aaaa2aa  ", self.agendanum
                self.paragraphs.append((tlc.lastindent and "boldline-indent"
                                        or "boldline-p", ptext))
                self.i += 1
            return

        # actual spoken section
        assert self.typ == "spoken"
        assert tlc.lastindent == 0 or len(
            tlc.indents) == 1  # doesn't happen in first paragraph of speech
        self.paragraphs = [("p", ptext)]
        while self.i < len(self.tlcall):
            tlc = self.tlcall[self.i]
            if self.DetectEndSpeech(tlc.paratext, tlc.lastindent, self.sdate):
                break
            ptext = MarkupLinks(
                CleanupTags(tlc.paratext, self.typ, self.paranum),
                self.undocname, self.paranum)
            bIndent = (len(tlc.indents) == 1) and (
                tlc.indents[0][0] != 0) and (tlc.indents[0][1] > 1)
            self.paragraphs.append(((bIndent and "blockquote" or "p"), ptext))
            self.i += 1
Exemplo n.º 28
0
def DetectSpeaker(ptext, indents, paranum, speakerbeforetookchair):
    #print ptext, "\n\n\n"
    if re.match("<i>(?:In favour|Against|Abstaining)",
                ptext):  # should be part of a voteblock
        print ptext
        #print tlcall[i - 1].paratext
        assert False

    if re.match(
            "(?:The agenda was adopted\.|A vote was taken by show of hands\.|There being no objection, it is so decided\.)$",
            ptext):
        if IsNotQuiet():
            print "italicizingline", len(indents), ptext
        ptext = "<i>%s</i>" % ptext

    indentationerror = ""
    if len(indents) == 1 and indents[0][0] == 0:
        if not re.match("<b> ", ptext) and not re.match(
                "(?:\(|<i>)+spoke in", ptext
        ):  # often there is a speaker with a blank space at the front
            indentationerror = "unindented-paragraph"
    if len(indents) > 2:
        indentationerror = "too many different indents"
    if len(indents) == 2 and indents[1][0] != 0:
        if (indents[0][1] == 1 and ptext[0] == '"'
                and indents[0][0] - indents[1][0] > 30):
            # turn this into a blockquote
            indents[0] = (indents[0][0], indents[0][1] + indents[1][1],
                          indents[0][2] + indents[1][2])
            del indents[1]
            if IsNotQuiet():
                pass
                #print "ququququq", indents
        else:
            indentationerror = "un-left-justified paragraph"

    mfixchinaspek = re.match(
        "<b>(Mr\. \w+)\s*</b>\s*([\w\-]+)\s*\((?:China|Republic of Korea)\)",
        ptext)
    if mfixchinaspek:
        #print "fixing chinaspeak", ptext, "\n"
        ptext = "<b>%s %s</b> %s" % (mfixchinaspek.group(1),
                                     mfixchinaspek.group(2),
                                     ptext[mfixchinaspek.end(2):])
        #print ptext

    if re.search("\s\S\s\S\s\S\s", ptext):
        print ptext
        raise unexception("probable gaps in text", paranum)

    mspek = re.match(respekp1, ptext)
    if not mspek:
        mspek = re.match(respekp2, ptext)
    if not mspek:
        mspek = re.match(respekp3, ptext)
    if not mspek:
        mspek = re.match(respek, ptext)
    assert not mspek or not re.search("[<>]", mspek.group(1))

    if not mspek and re.match("<[ib]>", ptext):
        speakerbeforetookchair = ""

    if mspek or speakerbeforetookchair:
        if indentationerror == "unindented-paragraph" and speakerbeforetookchair:
            indentationerror = False
        if indentationerror == "unindented-paragraph" and paranum.undocname in [
                "A-55-PV.60", "A-55-PV.63", "A-55-PV.64", "A-55-PV.68",
                "A-55-PV.59", "A-55-PV.44", "A-55-PV.46", "A-55-PV.48",
                "A-55-PV.49", "A-55-PV.52", "A-55-PV.56", "A-55-PV.51",
                "A-60-PV.37", "A-60-PV.38", "A-60-PV.42", "A-60-PV.51",
                "A-60-PV.79", "A-60-PV.85", "A-60-PV.91", "A-60-PV.86",
                "A-60-PV.87", "A-60-PV.92", "A-60-PV.93", "A-60-PV.94"
        ]:
            indentationerror = False
        if indentationerror:
            print ptext
            print indents
            raise unexception(indentationerror + " of speaker-intro", paranum)

    if respekSS and not mspek:
        m = re.match(respekSS, ptext)
        if IsNotQuiet():
            print ptext
            print "   ___ ", m and m.group(0)

    if mspek:
        assert not indentationerror
        assert not re.match("<i>", ptext)
        speakr = re.sub("\s+", " ", mspek.group(1).strip())
        nation = ""
        bIsNotnation = True
        lnation = mspek.group(2)

        mbumpnation = re.search("([^(]*?)\s*\(([^)]*)\)$", speakr)
        if mbumpnation and not lnation and FixNationName(
                mbumpnation.group(2), paranum.sdate):
            speakr = mbumpnation.group(1)
            lnation = mbumpnation.group(2)
            if IsNotQuiet():
                print "BBBB bumpingnat", speakr, lnation

        if lnation:
            nation = IsPrenation(lnation, paranum.sdate)
            if not nation:
                nation = FixNationName(lnation, paranum.sdate)
                bIsNotnation = not nation
            if not nation:
                nation = IsNonnation(lnation, paranum.sdate)
            if not nation:
                print ptext
                print "\ncheck if misspelt or new nonnation, can add * to front of it: ", lnation
                raise unexception("unrecognized nationC or nonnation", paranum)
        elif not re.match(
                "The(?: Acting| Temporary)? President|The(?: Deputy| Assistant)? Secretary-General|The(?: Acting)? Chairman|Transcript",
                speakr):
            if IsNotQuiet():  # allow for less strict when done by cronjob
                raise unexception("missing nation for %s" % speakr, paranum)

        if not re.match(
                "Mr\.|Mrs\.|Miss |Ms\.|Pope |The |King |Sultan |Prince |Secretary|Arch|Dr\.|Sir |Sheikh?a? |President |Monsignor |Chairman |Crown |His |Dame |Senator |Cardinal |Chief |Captain |Acting |Begum |Major-General |Shaikh |Judge |Count |Emir |Baroness |General |Nana |Princess |U |Rev\. |Kofi |Sayyid |Sheika |Bishop |Sir. |Wilmot |Eliza |Jos|Lord |Justice |Father |Commodore |Metropolitan |Transcript|Madam ",
                speakr):
            print speakr
            raise unexception("improper title on speaker", paranum)
        if re.search("[\.,:;]$", speakr):
            print speakr
            raise unexception("improper tail on speaker", paranum)
        if re.search("[,:;\(\)]", speakr):
            print speakr
            raise unexception("improper contents in speaker", paranum)

        typ = "spoken"
        currentspeaker = (speakr, nation, (mspek.group(5) or ""),
                          bIsNotnation)  # name, nation, language, not-a-nation flag
        #print currentspeaker
        ptext = ptext[mspek.end(0):]
        if re.search("</b>", ptext):
            print ptext
            raise unexception("bold in spoken text", paranum)

    elif speakerbeforetookchair:
        assert not indentationerror
        typ = "spoken"
        currentspeaker = speakerbeforetookchair
        #print "Continuation speaker", speakerbeforetookchair

    # non-spoken text
    else:
        #<b>Mr. Al-Mahmoud </b>(Qatar) (<i>spoke in Arabic</i>):
        if re.match("<b>.*?(?:</b>.*?:|:</b>)(?!</b>$)", ptext):
            print ptext
            raise unexception("improperly detected spoken text", paranum)

        if re.match("\(?<i>", ptext):
            mballots = re.search("Number of ballot papers", ptext)
            if mballots:
                #print "BALLOT:", ptext, "\n"
                indentationerror = False

            if indentationerror:
                print ptext
                print indents
                raise unexception(indentationerror + " of unspoken text",
                                  paranum)

            if not mballots:
                mptext = re.match(
                    "<i>(.*?)</i>\.?\s*(?:\((?:resolutions?|decision|draft resolution) (A?[\d/]*\s*(?:\(?[A-Z,\s]*(?:and|to) [A-Z]\)?|[A-Z]{1,2})?)\))?\.?$",
                    ptext)
                if not mptext and not re.match("\(<i>spoke in", ptext):
                    print "--%s--" % ptext
                    raise unexception("improper italicline", paranum)

            ptext = re.sub("</?[ib]>", "", ptext).strip()

            # further parsing of these phrases may take place in due course
            msodecided = re.match(
                "(?:There being no objection, )?[Ii]t (?:was|is) so decided(?: \(decision [\d/]*\s*(?:A|B|C|A and B)?\))?\.?$",
                ptext)
            mwasadopted = re.match(
                ".*?(?:resolution|decision|agenda|amendment|recommendation).*?(?:was|were) adopted(?i)",
                ptext)
            mcalledorder = re.match(
                "The meeting (?:was called to order|rose|was suspended|was adjourned|resumed|was resumed) (?:at|on)",
                ptext)
            mtookchair = re.match(
                "\s*(?:In the absence of the President, )?(.*?)(?:, \(?Vice[\-\s]President\)?,)? (?:took|in) the [Cc]hair\.?$",
                ptext)
            mretchair = re.match(
                "(?:The President|.*?, Vice-President,|Mrs. Albright.*?|Baroness Amos) (?:returned to|in) the Chair.$",
                ptext)
            mescort = re.search(
                "(?:was escorted|escorted the.*?) (?:(?:from|to) the (?:rostrum|podium|platform)|(?:from|into|to its place in) the (?:General Assembly Hall|Conference Room|Security Council Chamber))(?: by the President and the Secretary-General)?\.?$",
                ptext)
            msecball = re.search(
                "A vote was taken by secret ballot\.(?: The meeting was suspended at|$)",
                ptext)
            mminsil = re.search(
                "The (?:members of the (?:General )?Assembly|Council) observed (?:a|one) minute of (?:silent prayer (?:or|and) meditation|silence)\.$",
                ptext)
            mtellers = re.search(
                "At the invitations? of the (?:Acting )?Presidents?.*?acted as tellers\.$|Having been drawn by lot",
                ptext)
            melected = re.search(
                "[Hh]aving obtained (?:the required (?:two-thirds )?|an absolute )majority.*?(?:(?:were|was|been|is) s?elected|will be included [io]n the list)",
                ptext)
            mmisc = re.search(
                "The Acting President drew the following.*?from the box|sang.*?for the General Assembly|The Assembly heard a musical performance|The Secretary-General presented the award to|From the .*? Group:|Having been drawn by lot by the (?:President|Secretary-General),|were elected members of the Organizational Committee|President \w+ and then Vice-President|Vice-President \S+ \S+ presided over|The following .*? States have.*?been elected members of the Security Council",
                ptext)
            mmiscnote = re.search("\[In the 79th plenary .*? III.\]$", ptext)
            mmstar = re.match("\*", ptext)  # insert * in the text
            mmspokein = re.match(
                "\(spoke in \w+(?:; interpretation.*?|; .*? the delegation)?\)$",
                ptext)

            matinvite = re.match(
                "(?:At the invitation of the President, )?.*? (?:(?:took (?:a |the )?|were escorted to their )seats? at the Council table|(?:took|was invited to take) (?:(?:the |a |their )?(?:seat|place)s? reserved for \w+|a seat|a place|places|seats|their seats|his seat) at the (?:side of the )?Council (?:[Cc]hamber|table))(?:;.*?Chamber)?\.$",
                ptext)
            mscsilence = re.match(
                "The members of the (?:Security )?Council observed a minute of silence.$",
                ptext)
            mscescort = re.search(
                "(?:were|was) escorted to (?:seats|a seat|his place|a place) at the (?:Security )?Council table.$",
                ptext)
            mvtape = re.match(
                "A video ?(?:tape)? was (?:shown|played|displayed) in the Council Chamber.$|An audio tape, in Arabic,|The members of the General Assembly heard a musical performance.$",
                ptext)
            mvprojscreen = re.match(
                "(?:An image was|Two images were|A video was) projected on screen\.$",
                ptext)
            mvresuadjourned = re.match(
                "The meeting was resumed and adjourned on.*? a\.m\.$", ptext)

            if mmstar:
                ptext = ptext[1:]

            # the first line of the test is from the General Assembly; the second line adds in some from the Security Council
            if not (msodecided or mwasadopted or mcalledorder or mtookchair or mretchair or mballots or mescort or msecball or mminsil or mtellers or mmisc or melected or mmstar or mmiscnote or mmspokein or \
                    mvprojscreen or matinvite or mscsilence or mscescort or mvtape or mvresuadjourned):
                print "unrecognized--%s--" % ptext
                print re.match("At the invitations? of the (?:Acting )?",
                               ptext)
                raise unexception("unrecognized italicline", paranum)

            # we can add subtypes to these italic-lines
            typ = "italicline"
            if mtookchair or mretchair:
                typ = "italicline-tookchair"
            if mmspokein:
                typ = "italicline-spokein"
            currentspeaker = None

        elif re.match("<b>", ptext):
            if not re.match(reboldline, ptext):
                print ptext
                raise unexception("unrecognized bold completion", paranum)
            ptext = re.sub("</?b>", "", ptext).strip()
            typ = "boldline"
            currentspeaker = None

        else:
            typ = "unknown"
            print ptext, indents
            raise unexception("possible indent failure", paranum)

    return ptext, typ, currentspeaker
Exemplo n.º 29
0
def AppendCluster(res, tlc, sclusttype):
    # check whether this cluster should be merged into the previous paragraph
    assert sclusttype in ["gapcluster", "newpage", "newcolumn"]

    if res and sclusttype != "gapcluster" and len(tlc.indents) == 1:
        indentp = res[-1].indents[-1][0]
        indentn = tlc.indents[0][0]

        bbothindented = ((indentp in [31, 32]) and (indentn in [31, 32])) or \
                        ((indentp in [0, 1]) and (indentn in [0, 1])) or \
                        ((indentp in [36, 33]) and (indentp == indentn))
        bonelineparacont = (len(res[-1].indents)
                            == 1) and (res[-1].indents[0][1] == 1) and (
                                indentp in [31, 32]) and (indentn in [0, 1])

        td0 = res[-1].txls[-1].ltext[:3]
        td1 = tlc.txls[0].ltext[:3]
        if not re.match("<[ib]>", td0):
            td0 = ""
        if not re.match("<[ib]>", td1):
            td1 = ""
        bstylematches = (td0 == td1)
        #assert not (bbothindented and not bstylematches)
        if re.match("<i>In favour", tlc.txls[0].ltext):
            bstylematches = False
        if re.match("<b>Agenda", res[-1].txls[-1].ltext):
            bstylematches = False

        # likely continuation of paragraph
        if bbothindented and bstylematches:
            res[-1].txls.extend(tlc.txls)
            #print tlc.txls[0].ltext
            return
        else:
            if bonelineparacont:
                if IsNotQuiet():
                    pass
                    #print "checkthiscontinuation case"
                    #print indentp, indentn, bstylematches, bonelineparacont, res[-1].indents
                    #print " ----", tlc.txls[0].ltext
                if bstylematches:
                    if IsNotQuiet():
                        pass  #print "merging"
                    res[-1].txls.extend(tlc.txls)
                    return

    # new cluster; check the indenting pattern is good
    if len(tlc.indents) == 2:
        if tlc.indents[0] <= tlc.indents[1]:
            #print tlc.indents, tlc.txls[0].ltext
            #assert re.match("<[ib]>.*?</[ib]>", tlc.txls[0].ltext) # <i>In favour:</i>
            pass

    # two paragraphs may have been merged, try to separate them out
    elif len(tlc.indents) == 4 and tlc.indents[0][0] == tlc.indents[2][
            0] and tlc.indents[1][0] == tlc.indents[3][0]:
        if IsNotQuiet():
            pass  #print tlc.indents
        assert tlc.indents[0][0] == tlc.indents[2][0]
        assert tlc.indents[1][0] == tlc.indents[3][0]
        si = tlc.indents[0][2] + tlc.indents[1][2]
        tlcf = TextLineCluster(None)
        tlcf.txls = tlc.txls[:si]
        del tlc.txls[:si]
        tlcf.indents = tlc.indents[:2]
        del tlc.indents[:2]
        res.append(tlcf)
        if IsNotQuiet():
            pass
            #print "# paragraphs", si
            #print " ", tlc.txls[0].ltext
            #print tlcf.indents, tlc.indents

    elif len(tlc.indents) != 1:
        if IsNotQuiet():
            print tlc.indents, "jjjj"
        prevtop = -1
        for txl in tlc.txls:
            if IsNotQuiet():
                if prevtop == txl.top:
                    print " ",
                print txl.indent, txl.ltext
            prevtop = txl.top
        raise unexception(
            "unrecognized indent pattern",
            paranumC(txl.undocname, None, 0, -1, txl.textcountnumber))
    res.append(tlc)
    return
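
# Sketch of the paragraph-continuation test above, pulled out as a standalone
# predicate.  The indent values 0/1, 31/32 and 33/36 are the ones hard-coded
# in AppendCluster; whether the bold/italic style also matches is checked
# separately in the real code.
def both_indented_sketch(indentp, indentn):
    return ((indentp in [31, 32]) and (indentn in [31, 32])) or \
           ((indentp in [0, 1]) and (indentn in [0, 1])) or \
           ((indentp in [36, 33]) and (indentp == indentn))

if __name__ == "__main__":
    print both_indented_sketch(31, 32), both_indented_sketch(0, 31)   # True False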
Exemplo n.º 30
0
    def __init__(self, sdate, docid, subheadingid, agendanumstr, titletext):
        self.sdate = sdate
        self.docid = docid
        mdocid = re.match("A-(\d\d)-PV\.(\d+)$", docid)
        assert mdocid, docid
        self.nsession = int(mdocid.group(1))
        self.nmeeting = int(mdocid.group(2))  # so we can sort by it
        self.sortval = (self.nsession, self.nmeeting)

        self.subheadingid = subheadingid
        self.agendanumstr = re.sub("^condolence-.*$", "condolence",
                                   agendanumstr)
        #if self.agendanumstr[0] == "c":
        #    print self.agendanumstr, "kkkk"
        self.agendanums = []
        for agendanum in agendanumstr.split(","):
            sa = agendanum.split("-")
            assert len(sa) == 2
            assert int(sa[1]) == self.nsession
            if sa[0] == "condolence":
                self.agendanums.append(sa[0])
            else:
                self.agendanums.append(agendanum)

        # break the agenda text up by paragraph
        self.titletext = titletext
        self.titlelines = re.findall(
            "<(?:p|blockquote)[^>]*>(.*?)\.?</(?:p|blockquote)>", titletext)

        # loop forwards and strip the agenda-item title parts first
        i = 0
        while i < len(self.titlelines):
            # remove the agenda items title parts
            magmatch = rfrontcomm.match(self.titlelines[i])
            if magmatch:
                if magmatch.end(0) == len(self.titlelines[i]):
                    if len(self.titlelines) > 1:
                        del self.titlelines[i]
                        continue
                else:
                    self.titlelines[i] = self.titlelines[i][
                        magmatch.end(0):].capitalize()
            i += 1

        # loop backwards and trim as much as possible from each row of text
        for i in range(len(self.titlelines) - 1, -1, -1):
            # remove trailing references to documents in parentheses
            mtraildoc = re.search(rtraildoc, self.titlelines[i])
            if mtraildoc:
                if mtraildoc.start(0) == 0:
                    if len(self.titlelines) > 1:
                        del self.titlelines[i]
                        continue
                else:
                    self.titlelines[i] = self.titlelines[i][:mtraildoc.start(0
                                                                             )]

            # remove trailing references to reports   ": report of the Fifth Committee (Part III)"
            while True:  # recurse
                mtrailcommrep = rtrailcomm.search(self.titlelines[i])
                if not mtrailcommrep:
                    break
                if mtrailcommrep.start(0) == 0:
                    if len(self.titlelines) > 1:
                        del self.titlelines[i]
                        continue
                else:
                    self.titlelines[i] = self.titlelines[i][:mtrailcommrep.
                                                            start(0)]

            # remove entire lines that are generic
            mgenerline = rgenerline.search(self.titlelines[i])
            if mgenerline and len(self.titlelines) > 1:
                del self.titlelines[i]
                continue

            # substitutions
            for substm, substr in substs:
                self.titlelines[i] = re.sub(substm, substr, self.titlelines[i])

        if re.search("agenda item(?i)", self.titlelines[0]) and IsNotQuiet():
            print "Poss bad agenda item", self.titlelines
        assert self.titlelines
        assert not re.match("\s*$", self.titlelines[0]), self.titletext
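
# Sketch of the "item-session" agenda-number convention handled above: each
# comma-separated entry is "<agenda item>-<assembly session>", and the session
# part must match the session parsed out of the document id.  The sample
# values are invented; the special "condolence" entries are ignored here.
def check_agendanums_sketch(agendanumstr, nsession):
    kept = []
    for agendanum in agendanumstr.split(","):
        sa = agendanum.split("-")
        assert len(sa) == 2 and int(sa[1]) == nsession, agendanum
        kept.append(agendanum)
    return kept

if __name__ == "__main__":
    print check_agendanums_sketch("10-62,118-62", 62)   # ['10-62', '118-62']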