Пример #1
0
    def parse(self, bs_in, fp_out):
        """Locate the single results table on the page and dump it as TSV.

        Only tables without nested tables are parsed. A parsed table counts
        as a results table when it has more than five rows. Exactly one such
        table must be present; otherwise an error is logged and nothing is
        written.

        From the chosen table, rows with two or three cells are kept. A row
        whose first cell is blank, or which has exactly two cells, is written
        as a ``'-----'`` separator row; every other kept row is written with
        its cells stripped.

        :param bs_in: parsed document exposing ``find_all('table')``.
        :param fp_out: output handle forwarded to ``array2d_2tsv``.
        :return: always an empty list (this parser schedules no follow-ups).
        """
        all_tables = bs_in.find_all('table')
        table_parser = HtmlTableParser()

        # Parse every leaf table and keep those long enough to be results.
        candidates = []
        for tbl in all_tables:
            if table_parser.has_child_tables(tbl):
                continue
            parsed = table_parser(tbl)
            if len(parsed) > 5:
                candidates.append(parsed)

        if len(candidates) > 1:
            self.logger.error("Found more than one results table (%s) in %s" % (str(len(candidates)), self.filename_in_full))
            return []
        if not candidates:
            self.logger.error("Could not found any result table in " + self.filename_in_full)
            return []

        # Exactly one results table was found.
        tsv_rows = []
        for cells in candidates[0]:
            width = len(cells)
            if width not in (2, 3):
                continue
            if cells[0].strip() == '' or width == 2:
                tsv_rows.append(['-----'])
            else:
                tsv_rows.append([cell.strip() for cell in cells])

        array2d_2tsv(tsv_rows, fp_out)
        return []
Пример #2
0
    def parse(self, bs_in, fp_out):
        """Flatten the date/region-grouped link table and schedule its links.

        The table at index 7 of ``bs_in.find_all('table')`` is parsed row by
        row (indexing fails if the page has fewer tables):

        * a one-cell row sets the current date;
        * a three-cell row may set the current region or subregion from its
          first cell (a leading ``'\\xa0\\xa0'`` marks a subregion) and
          carries the value and link in cells 1 and 2;
        * any other row carries the value and link in cells 0 and 1.

        Each data row is written to TSV as
        ``[index, date, region, subregion, value, href]`` with all cells
        stripped.

        :param bs_in: parsed document exposing ``find_all('table')``.
        :param fp_out: output handle forwarded to ``array2d_2tsv``.
        :return: a generator of ``[href, output filename, parser]`` items,
            one per data row.
        :raises AssertionError: if a data row's last column does not contain
            ``'http'``. This is raised explicitly rather than via ``assert``
            so the check is not dropped when Python runs with ``-O``.
        """
        table = bs_in.find_all('table')[7]
        extract_row = CombinedRowExtractor(row_extractor_simple, row_extractor_href)
        parse_table = HtmlTableParser(extract_row)

        # NOTE(review): date/region/subregion start as None, so a data row
        # that appears before the first date or region row fails on .strip().
        date = None
        region = None
        subregion = None
        rows = []
        for row in parse_table(table):
            if len(row) == 1:
                # Date header row.
                date = row[0]
                continue

            if len(row) == 3:
                # Row with a region column; a blank cell keeps the current one.
                name = row[0]
                if name.strip() != '':
                    if name.startswith('\xa0\xa0'):
                        subregion = name
                    else:
                        region = name
                        subregion = ''
                value, href = row[1], row[2]
            else:
                value, href = row[0], row[1]

            # The running index equals the number of rows collected so far.
            record = [cell.strip() for cell in
                      (str(len(rows)), date, region, subregion, value, href)]
            rows.append(record)
            if 'http' not in record[5]:
                raise AssertionError(
                    "row %s has no http link in its last column: %r"
                    % (record[0], record[5]))

        array2d_2tsv(rows, fp_out)

        # [[href, filename, parser], ...]
        return ([record[5], self.make_filename_out(str(record[0])), self.subj_parser(self.root_path)] for record in rows)
Пример #3
0
    def parse(self, bs_in, fp_out):
        """Collect the candidates-list link and the results link from a page.

        Every link found is written to TSV. Follow-up jobs are returned only
        when exactly two links were found in total; otherwise an error is
        logged and an empty list is returned.

        A ``None`` result from ``get_candlink`` or ``get_reslink`` is treated
        as "no links", so it ends up in the logged error path instead of
        raising ``TypeError`` from the ``for`` loop.

        :param bs_in: parsed document handed to the link finders.
        :param fp_out: output handle forwarded to ``array2d_2tsv``.
        :return: list of ``[link[1], output filename, parser]`` items, or
            ``[]`` when the number of links found is not two.
        """
        version = self.detect_version(bs_in)
        jobs = []
        found_links = []

        cand_links = get_candlink(bs_in)
        if cand_links is None:
            cand_links = []
        for link in cand_links:
            jobs.append([
                link[1],
                self.make_filename_out('candidates' if version ==
                                       1 else 'candidates1'),
                RawParserCandidatesList(root_path=self.root_path,
                                        version=version)
            ])
            found_links.append(link)

        res_links = get_reslink(bs_in)
        if res_links is None:
            res_links = []
        for link in res_links:
            jobs.append([
                link[1],
                self.make_filename_out('results'),
                RawParserResultsSummary(root_path=self.root_path,
                                        version=version)
            ])
            found_links.append(link)

        # The TSV is written even on failure so the links can be inspected.
        array2d_2tsv(found_links, fp_out)
        if len(found_links) != 2:
            self.logger.error(
                'parsing %s for hrefs to cands and res found %s hrefs, not two'
                % (self.filename_in, str(len(found_links))))
            return []
        return jobs
Пример #4
0
 def parse(self, bs_in, fp_out):
     """Write the candidate list to TSV and schedule one card per candidate.

     :param bs_in: parsed document handed to ``self.parse_candlist``.
     :param fp_out: output handle forwarded to ``array2d_2tsv``.
     :return: list of ``[link, output path, parser]`` items, one per
         candidate row, or ``[]`` when no candidate list was found.
     """
     candidates = self.parse_candlist(bs_in, 'thead')
     if candidates is None:
         self.logger.error("Candidates not found in " +
                           self.filename_in_full)
         return []

     # Candidate cards are stored next to the input file.
     out_dir = os.path.dirname(self.filename_in_full)
     array2d_2tsv(candidates, fp_out)

     # The full name is in the second column, the link is in the last one.
     follow_ups = []
     for row in candidates:
         link = row[-1]
         card_path = os.path.join(out_dir, row[1] + '.html')
         card_parser = RawParserCandidateCard(self.root_path, version=2)
         follow_ups.append([link, card_path, card_parser])
     return follow_ups
Пример #5
0
    def parse(self, bs_in, fp_out):
        """Collect candidate-list links plus either subregion or results links.

        ``self.version`` is detected from the page when it is still ``None``.
        If subregions are found, one job per subregion is scheduled and the
        results link is not looked up. Otherwise exactly one results link is
        expected; any other count is logged as an error and only the
        candidate jobs are returned. All collected links are written to TSV.

        :param bs_in: parsed document handed to the link finders.
        :param fp_out: output handle forwarded to ``array2d_2tsv``.
        :return: list of ``[link[1], output filename, parser]`` items, or
            ``[]`` when ``get_candlink`` returned ``None``.
        """
        cand_links = get_candlink(bs_in)
        if self.version is None:
            self.version = self.detect_version(bs_in)

        if cand_links is None:
            self.logger.error("Cands not found " + self.filename_in_full)
            return []

        jobs = []
        tsv_rows = []

        for link in cand_links:
            list_parser = RawParserCandidatesList
            jobs.append([
                link[1],
                self.make_filename_out(
                    'candidates' if self.version == 1 else 'candidates1'),
                list_parser(root_path=self.root_path, version=self.version),
            ])
            tsv_rows.append(link)

        subregions = self.find_subregions(bs_in)
        if subregions is not None and len(subregions) > 0:
            # Subregions take precedence over a results link on this page.
            self.logger.info("subregs found in " + self.filename_in_full)
            for subregion in subregions:
                # The output name is the first space-separated word of the title.
                short_name = subregion[0].split(' ')[0]
                jobs.append([
                    subregion[1],
                    self.make_filename_out(short_name),
                    RawParserMajorSubjPageSubreg(root_path=self.root_path,
                                                 version=self.version),
                ])
                tsv_rows.append(subregion)
        else:
            result_links = get_reslink(bs_in)
            if len(result_links) == 1:
                jobs.append([
                    result_links[0][1],
                    self.make_filename_out('results'),
                    RawParserResultsSummary(self.root_path, self.version),
                ])
                tsv_rows.append(result_links[0])
            else:
                self.logger.error(
                    "some troubles with getting results in %s; got reslinks %s" %
                    (self.filename_in_full, str(len(result_links))))

        array2d_2tsv(tsv_rows, fp_out)
        return jobs
Пример #6
0
    def parse(self, bs_in, fp_out):
        """Find the single results link on a subregion page and schedule it.

        Exactly one results link is expected. Any other count is logged as
        an error and nothing is written or scheduled.

        :param bs_in: parsed document handed to ``get_reslink``.
        :param fp_out: output handle forwarded to ``array2d_2tsv``.
        :return: a one-item list ``[[link[1], output path, parser]]``, or
            ``[]`` on error. The output path is the input file name with
            ``.results`` inserted before its extension.
        """
        reslinks = get_reslink(bs_in)
        if len(reslinks) != 1:
            self.logger.error(
                "some troubles with getting results in %s; got reslinks %s" %
                (self.filename_in_full, str(len(reslinks))))
            return []

        # ``reslinks`` is already a list of link rows, so it is written as-is.
        # Wrapping it in another list would emit the whole link as one cell.
        array2d_2tsv(reslinks, fp_out)

        stem, ext = os.path.splitext(self.filename_in_full)
        return [[
            reslinks[0][1],
            '%s.results%s' % (stem, ext),
            RawParserResultsSummary(self.root_path, self.version)
        ]]
Пример #7
0
 def parse(self, bs_in, fp_out):
     """Write the page's single length-11 table to TSV.

     Every ``<table>`` is parsed. For versions 1 and 2, exactly one parsed
     table of length 11 must exist, otherwise an error is logged. Version 1
     writes the table unchanged; version 2 writes it without its first row.
     Any other version writes nothing.

     :param bs_in: parsed document exposing ``find_all('table')``.
     :param fp_out: output handle forwarded to ``array2d_2tsv``.
     :return: always an empty list (this parser schedules no follow-ups).
     """
     to_rows = HtmlTableParser()
     parsed_tables = [to_rows(tbl) for tbl in bs_in.find_all('table')]

     if self.version == 1:
         drop_first_row = False
     elif self.version == 2:
         drop_first_row = True
     else:
         return []

     matches = [rows for rows in parsed_tables if len(rows) == 11]
     if len(matches) != 1:
         self.logger.error("Error in parsing. Found several potential tables in " + self.filename_in_full)
         return []

     table = matches[0]
     array2d_2tsv(table[1:] if drop_first_row else table, fp_out)
     return []