Example #1
    def test_eif(self):
        """each XML file in the JATS dir with a matching *complete* output
        in the EIF directory generates output equal to it"""

        def xml_fname_to_eif(xml_fname, xml_path):
            return join(self.source_eif_dir, os.path.splitext(xml_fname)[0] + ".json")

        ddiffs = {}

        for xml_file, xml_path in self.xml_path_list.items():
            eif_file = xml_fname_to_eif(xml_file, xml_path)

            if not os.path.exists(eif_file):
                LOG.info('skipping %s, path `%s` not found', xml_file, eif_file)
                continue

            generated_eif = json.loads(feeds.scrape(xml_path, lambda x: x[0]['article'][0]))
            with open(eif_file) as fh:
                expected_eif = json.load(fh)

            LOG.info("testing %s", xml_path)
            ddiff = DeepDiff(self.byteify(expected_eif), self.byteify(generated_eif))

            # record any mismatch and keep going so all failures are reported together
            if ddiff:
                ddiffs[eif_file] = ddiff

        if ddiffs:
            for attr, value in ddiffs.items():
                print(attr)
                pprint(value)
                print("\n")
            self.fail("generated EIF does not match expected EIF")
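The test above leans on DeepDiff, which returns an empty result when two structures match and a dict describing every difference otherwise. A minimal sketch of that behaviour (the field names and values are invented for illustration):

from deepdiff import DeepDiff

expected = {'title': 'An article', 'volume': 4}
generated = {'title': 'An article', 'volume': 5}

# reports something like:
# {'values_changed': {"root['volume']": {'new_value': 5, 'old_value': 4}}}
print(DeepDiff(expected, generated))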
Example #2
    def test_eif_partials(self):
        """each XML file in the JATS dir with a matching *partial*
        output in the EIF/partial directory is present and equal"""

        def xml_fname_to_eif_partial(xml_fname, xml_path):
            return join(self.source_partial_dir, os.path.splitext(xml_fname)[0] + "-match.json")

        ddiffs = {}

        for xml_file, xml_path in self.xml_path_list.items():
            eif_path = xml_fname_to_eif_partial(xml_file, xml_path)

            if not os.path.exists(eif_path):
                LOG.info('skipping %s, path `%s` not found', xml_file, eif_path)
                continue

            generated_eif = json.loads(feeds.scrape(xml_path, lambda x: x[0]['article'][0]))
            # a list of maps with the keys 'description' and 'data'
            with open(eif_path) as fh:
                eif_partial_tests = json.load(fh)

            for test in eif_partial_tests:
                if 'description' not in test or 'data' not in test:
                    LOG.debug('description or data elements not found in file %r, skipping', eif_path)
                    continue

                desc, expected_eif = test['description'], test['data']
                for element, expected_partial_eif in expected_eif.items():
                    if element not in generated_eif:
                        ddiff = "EIF generated from %r doesn't contain expected element %r (in partial file %r)" \
                                % (xml_path, element, eif_path)
                    else:
                        ddiff = DeepDiff(self.byteify(expected_partial_eif), self.byteify(generated_eif[element]))

                    if ddiff:
                        ddiffs.setdefault(eif_path, {})[desc] = ddiff

        if ddiffs:
            for attr, values in ddiffs.items():
                print(attr)

                for desc, value in values.items():
                    print(desc)
                    pprint(value)
                    print("\n")
            self.fail("generated partial EIF does not match expected EIF")
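For reference, each *-match.json file consumed above is a list of maps with 'description' and 'data' keys, where every key under 'data' names a top-level EIF element to check. A minimal sketch of such a fixture as the equivalent Python structure (the element name and value are invented for illustration):

# what json.load(fh) above would yield for a minimal *-match.json file;
# 'status' and 'VOR' are invented values, not taken from the original
eif_partial_tests = [
    {
        'description': 'publication status is resolved correctly',
        'data': {'status': 'VOR'},
    },
]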
Example #3
    def setUp(self):
        self.results = {}
        self.references = {}
        self.mod = __import__("feeds")
        source_directory = 'JATS/'
        reference_directory = 'JSON/'

        for f in listdir(source_directory):
            if isfile(join(source_directory, f)):
                reference_file_name = f.replace('.xml', '.json')
                with open(join(source_directory, f), "r") as source_file:
                    source_string = source_file.read()
                    # round-trip through JSON so the generated output is compared
                    # in the same form as the expected results
                    res = feeds.scrape(source_string, lambda x: x[0]['article'][0])
                    self.results[reference_file_name] = json.loads(res)

                with open(join(reference_directory, reference_file_name), "r") as reference_file:
                    self.references[reference_file_name] = json.loads(reference_file.read())
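setUp only builds the fixtures; a hypothetical test method (not part of the original) could then pair each generated result with its reference by file name:

    def test_results_match_references(self):
        # hypothetical: every scraped result should equal the reference
        # JSON document of the same name
        for fname, reference in self.references.items():
            self.assertEqual(reference, self.results[fname])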
Example #4
def scrape(xml):
    return feeds.scrape(xml, lambda x: x[0]['article'][0])
Example #5
def scrape(xml, article_version=None):
    return feeds.scrape(xml, lambda x: x[0]['article'][0], article_version)
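A usage sketch for this wrapper, assuming (consistently with Example #3) that the XML is passed in as a string and that the result is a JSON string, since the tests above pass it to json.loads(); the file name is invented:

import json

# 'article.xml' is an invented path for illustration
with open('article.xml') as fh:
    eif = json.loads(scrape(fh.read(), article_version=1))
print(eif.get('title'))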
Example #6
import pystache


def render(fname, dat):
    # print(len(dat['entries']), len(dat['entries'][0]))
    with open(fname) as f:
        template = f.read()
    print(pystache.render(template, dat))


if __name__ == '__main__':
    import feeds
    entries = feeds.scrape(feeds.feeds)
    render(
        '/home/ygreif/learndjango/blur/crawler/index.tmpl', {
            'rows': [{'entries': entries[i:i + 3]}
                     for i in range(2, len(entries) - 6, 3)],
            'lead': entries[0],
            'second': entries[1],
            'title': 'The Discerning Whig',
        })
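For context, pystache.render expands standard Mustache sections, so a template consuming the 'rows'/'entries' context built above might look like the sketch below; the 'headline' field on each entry is an invented name, not taken from the original template:

import pystache

# Mustache sections {{#rows}} and {{#entries}} iterate over the lists in
# the context; {{headline}} is an invented entry field for illustration
template = (
    "<h1>{{title}}</h1>\n"
    "{{#rows}}<div class=\"row\">"
    "{{#entries}}<p>{{headline}}</p>{{/entries}}"
    "</div>{{/rows}}\n"
)

print(pystache.render(template, {
    'title': 'The Discerning Whig',
    'rows': [{'entries': [{'headline': 'First story'},
                          {'headline': 'Second story'}]}],
}))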