示例#1
0
    def inspect(self, parsers=None, log_results=False):
        if not parsers:
            parsers = parser_registry.keys()

        if log_results:
            start_time = datetime.datetime.now()
            (filename, file_url, C) = self.init_csv_writer(
#                    slug='mapper-inspector',
                    header=['index', 'file_count'] +
                        [p for p in parsers] +
                        ['blog', 'filepath', 'timestamp'],
                )

        for (i,blog) in enumerate(self.blog_list):
            results = {}
            for p in parsers:
                results[p] = self.inspect_blog_mapper_pair(blog, p)

            row = [i, self.count_blog_files(blog)] + [results[p] for p in parsers] + \
                [
                    blog,#.split('/')[-1],
                    self.blog_path+blog,
                    datetime.date.strftime(datetime.datetime.now(), "%Y/%m/%d %H:%M:%S")
                ]
            print '\t'.join([str(r) for r in row])

            if log_results:
                C.writerow( row )

        if log_results:
            print 'Output:\t', file_url
示例#2
0
    def inspect_multiple(self, parsers=None, shuffle=False, max_posts=20, verbose=False):
        "Check parsers and see which one seems to fit the best."
            
        if not parsers:
            parsers = parser_registry.keys()
        
        post_count = {}
        perfect_pct = {}
        acceptable_pct = {}

        for p in parsers:
            results = self.inspect_blog_parser_pair(self.blog_url, p, max_posts, shuffle)
#            print results
            post_count[p] = len(results)
            perfect_pct[p] = self.calc_percent_perfect(results)
            acceptable_pct[p] = self.calc_percent_acceptable(results)

            print p
            print '\t', post_count[p], '\tPost count'
            print '\t', round(100*perfect_pct[p]), '\t% perfect'
            print '\t', round(100*acceptable_pct[p]), '\t% acceptable'
            print
            for f in field_keys:
                print '\t', self.calc_success_rate(results, f), f

            print

        best_parser = max(acceptable_pct, key=acceptable_pct.get)
        if acceptable_pct[best_parser] <= 0:
            best_parser = "None"
            best_pct = -1
        else:
            best_pct = acceptable_pct[best_parser]
                    
        print
        print best_parser, ':', round(100*best_pct)
示例#3
0
    def inspect(self, parsers=None, log_results=False, log_summary=False,
        shuffle=False, max_posts=20, verbose=False):
        
        if not parsers:
            parsers = parser_registry.keys()

        if log_results:
            start_time = datetime.datetime.now()
            
            #Initialize the blog-by-parser csv
            header = ['index'] + \
                ['post_count'] + \
                ['pct_perfect', 'pct_acceptable'] + \
                [f for f in field_keys] + \
                ['parser', 'blog', 'filepath', 'timestamp']
            (bxp_filename, bxp_file_url, bxp_csv) = self.init_csv_writer(slug="ParserInspector-BxP",header=header)

        if log_summary:
            #Initialize the blog csv
            header = ['index'] + \
                ["best_parser", "best_pct"] + \
                [p+"_post_count" for p in parsers] + \
                [p+"_pct_perfect" for p in parsers] + \
                [p+"_pct_acceptable" for p in parsers] + \
                ['blog', 'filepath', 'timestamp']
            (summary_filename, summary_file_url, summary_csv) = self.init_csv_writer(slug="ParserInspector-summary",header=header)

        acceptable_matches = 0

        for (i,blog) in enumerate(self.blog_list):
            post_count = {}
            perfect_pct = {}
            acceptable_pct = {}

            for p in parsers:
                results = self.inspect_blog_parser_pair(blog, p, max_posts, shuffle)
                post_count[p] = len(results)
                perfect_pct[p] = self.calc_percent_perfect(results)
                acceptable_pct[p] = self.calc_percent_acceptable(results)

                if log_results:
                    row = [i] + \
                        [post_count[p]] + \
                        [perfect_pct[p] ] + \
                        [acceptable_pct[p] ] + \
                        [self.calc_success_rate(results, f) for f in field_keys] + \
                        [
                            p,
                            blog,#.split('/')[-1],
                            self.blog_path+blog,
                            datetime.date.strftime(datetime.datetime.now(), "%Y/%m/%d %H:%M:%S")
                        ]
                    bxp_csv.writerow( row )

#                    print '\t'.join([str(r) for r in row])

            if log_summary:
                best_parser = max(acceptable_pct, key=acceptable_pct.get)
                if acceptable_pct[best_parser] <= 0:
                    best_parser = "None"
                    best_pct = -1
                else:
                    best_pct = acceptable_pct[best_parser]
                    
                    if best_pct == 1:
                        acceptable_matches += 1
                    
                row = [i] + \
                    [best_parser, best_pct] + \
                    [post_count[p] for p in parsers] + \
                    [perfect_pct[p] for p in parsers] + \
                    [acceptable_pct[p] for p in parsers] + \
                    [
                        blog,#.split('/')[-1],
                        self.blog_path+blog,
                        datetime.date.strftime(datetime.datetime.now(), "%Y/%m/%d %H:%M:%S")
                    ]
                summary_csv.writerow( row )
                print '\t'.join([str(r) for r in row])

        print '='*80
        print acceptable_matches, 'acceptable matches'
        print len(self.blog_list), 'total blogs checked'
        print float(acceptable_matches)/len(self.blog_list), 'percent success'
        print 
        
        if log_results:
            print 'Results file:\t', bxp_file_url

        if log_summary:
            print 'Summary file:\t', summary_file_url