Пример #1
0
def run_update():
    """Fetch subjects for cables listed as missing and persist them.

    Only cable ids that occur both in the missing-subjects file and in the
    Aftenposten or Russian Reporter id sets are processed.  A cable present
    in both sets is fetched from Aftenposten.  Every result is stored in
    ``_additional_subjects`` as ``(subject, iri)`` and the mapping is handed
    to ``_write_fixed_subjects`` once all cables have been visited.

    For Aftenposten cables the stored IRI is the first group matched by
    ``_AP_SOURCE_PATTERN`` in the page; for Russian Reporter cables it is
    the IRI of the fetched page itself.

    Cables whose page yields no subject -- or, for Aftenposten, no source
    IRI -- are reported on stdout and skipped, so a single odd page does
    not abort the run before the collected subjects are written.
    """
    from contextlib import closing

    ap_cables = _aftenposten_cable_ids()
    rr_cables = _russianreporter_cable_ids()
    missing_subjects = _file_as_set(_FILE_MISSING_SUBJECTS)
    cable_ids = missing_subjects & (ap_cables | rr_cables)
    for cable_id in cable_ids:
        # Every id is in at least one of the two sets (see the intersection
        # above), so "not Aftenposten" implies "Russian Reporter".
        from_aftenposten = cable_id in ap_cables
        base_iri = _AP_IRI if from_aftenposten else _RR_IRI
        source_iri = base_iri + cable_id + '.html'
        # Close the HTTP response deterministically instead of leaking it.
        with closing(urllib2.urlopen(source_iri)) as response:
            page = response.read().decode('utf-8', 'ignore')
        if from_aftenposten:
            match = _AP_SOURCE_PATTERN.search(page)
            if match is None:
                # Previously this raised AttributeError and killed the run.
                print('-------- ERROR, cannot find source IRI in ' + source_iri)
                continue
            cable_iri = match.group(1)
        else:
            cable_iri = source_iri
        subject = parse_subject(page)
        if not subject and from_aftenposten and cable_id == u'09OSLO399':
            # This cable is malformed; use the hard-coded subject instead.
            subject = u'NORWAYS RUSSIA POLICY: WISHFUL THINKING'
        if not subject:
            print('-------- ERROR, cannot parse subject of ' + source_iri)
            continue
        _additional_subjects[cable_id] = (subject, cable_iri)
    _write_fixed_subjects(_additional_subjects)
Пример #2
0
def run_update():
    """Collect the subjects of cables that are recorded as missing one.

    The ids read from ``_FILE_MISSING_SUBJECTS`` are intersected with the
    known Aftenposten and Russian Reporter cable ids.  For each remaining
    id the cable page is downloaded and its subject parsed; the result is
    stored in ``_additional_subjects`` as ``(subject, iri)``.  Aftenposten
    is preferred when a cable is known to both sources.  Finally the
    mapping is passed to ``_write_fixed_subjects``.

    Pages without a parsable subject (and Aftenposten pages in which
    ``_AP_SOURCE_PATTERN`` does not match) are reported on stdout and
    skipped rather than aborting the whole update.
    """
    def fetch(iri):
        # Read and decode the page, always closing the HTTP response.
        response = urllib2.urlopen(iri)
        try:
            return response.read().decode('utf-8', 'ignore')
        finally:
            response.close()

    ap_cables = _aftenposten_cable_ids()
    rr_cables = _russianreporter_cable_ids()
    known_cables = ap_cables | rr_cables
    missing_subjects = _file_as_set(_FILE_MISSING_SUBJECTS)
    cable_ids = missing_subjects & known_cables
    for cable_id in cable_ids:
        if cable_id in ap_cables:
            source_iri = _AP_IRI + cable_id + '.html'
            page = fetch(source_iri)
            match = _AP_SOURCE_PATTERN.search(page)
            if match is None:
                # A missing match used to raise AttributeError on .group(1)
                # and lose every subject collected so far.
                print('-------- ERROR, cannot find source IRI in ' + source_iri)
                continue
            subject = parse_subject(page)
            if not subject and cable_id == u'09OSLO399':
                # This cable is malformed; fall back to the known subject.
                subject = u'NORWAYS RUSSIA POLICY: WISHFUL THINKING'
            if not subject:
                print('-------- ERROR, cannot parse subject of ' + source_iri)
                continue
            _additional_subjects[cable_id] = (subject, match.group(1))
        elif cable_id in rr_cables:
            source_iri = _RR_IRI + cable_id + '.html'
            subject = parse_subject(fetch(source_iri))
            if not subject:
                print('-------- ERROR, cannot parse subject of ' + source_iri)
                continue
            # The page IRI itself is recorded as the source here.
            _additional_subjects[cable_id] = (subject, source_iri)
    _write_fixed_subjects(_additional_subjects)
Пример #3
0
 def subject(self):
     """Return the result of ``reader.parse_subject`` for this object.

     The parser receives ``self.content`` and ``self.reference_id``.
     """
     parse = reader.parse_subject
     return parse(self.content, self.reference_id)
Пример #4
0
 def subject(self):
     """Parse the subject from ``self.content`` via ``reader.parse_subject``.

     ``self.reference_id`` is forwarded as the second positional argument.
     """
     parser_args = (self.content, self.reference_id)
     return reader.parse_subject(*parser_args)
Пример #5
0
 def check(expected, input):
     """Assert via ``eq_`` that ``parse_subject(input)`` equals ``expected``."""
     actual = parse_subject(input)
     eq_(expected, actual)
Пример #6
0
 def check(content, clean, expected):
     """Assert via ``eq_`` that parsing ``content`` yields ``expected``.

     ``clean`` is forwarded to ``parse_subject`` as the ``clean`` keyword.
     """
     actual = parse_subject(content, clean=clean)
     eq_(expected, actual)