예제 #1
0
def convert_data_to_table_format():
    """Run the transform step: parse the scraped file into the table file.

    Wraps ``SCRAPPED_FILE`` in a ``FileStorage``, hands that storage to a
    ``Parser`` and asks the parser to write to ``TABLE_FORMAT_FILE``.
    """
    logger.info("transform")

    # Per the original note: the gathered JSON data is turned into a pandas
    # DataFrame and saved as CSV (done inside Parser.parse).
    Parser(FileStorage(SCRAPPED_FILE)).parse(TABLE_FORMAT_FILE)
예제 #2
0
파일: run.py 프로젝트: avelvi/web-parser
def start():
    """Validate the configured properties and, if valid, run the parser.

    Logs 'Something went wrong' and returns without running anything when
    ``validate_properties`` rejects the properties.
    """
    if not validate_properties(config.get_properties()):
        logger.info('Something went wrong')
        return

    logger.info('Starting application')
    Parser(config).run()
예제 #3
0
def get_metadata_of_incoming_statements(incoming_dir: Path,
                                        ) -> list[IncomingStatement]:
    """Collect parser and metadata for every recognised incoming statement.

    Each sub-directory of ``incoming_dir`` is treated as a bank name and
    looked up in the module-level ``parsers`` mapping. Files whose lowercased
    suffix has no parser registered for that bank are skipped silently;
    unknown banks are reported on stderr.

    :param incoming_dir: directory containing one sub-directory per bank
    :return: one ``IncomingStatement`` per parsed file, in sorted
        directory/file order
    """
    collected = []
    for bank_dir in sorted(incoming_dir.iterdir()):
        if not bank_dir.is_dir():
            continue

        bank = bank_dir.name
        bank_parsers = parsers.get(bank)
        if bank_parsers is None:
            print('unknown bank:', bank, file=sys.stderr)
            continue

        statement_files = sorted(bank_dir.iterdir())
        if statement_files:
            print('importing bank statements from', bank)

        for src_file in statement_files:
            suffix = src_file.suffix.lower()
            try:
                parser_cls = bank_parsers[suffix]
            except KeyError:
                # No parser registered for this file type.
                continue

            parser = parser_cls(src_file)
            metadata = parser.parse_metadata()
            print(f'{metadata.start_date} → {metadata.end_date}: {src_file}')
            collected.append(IncomingStatement(
                statement_path=src_file,
                parser=parser,
                metadata=metadata,
                ))
    return collected
예제 #4
0
    def parse(self, data):
        """
        Extract author profile fields from an HTML page.

        :param data: html text (page)
        :return: list ``[name, birth_date, birth_place, death_date,
            death_place, num_books, adepts, readers]``
        :raises Parser.IncorrectFormat: when the profile header span, every
            ``<a>`` tag, or every ``group-row-title`` div is missing
        """
        soup = BeautifulSoup(data, 'html.parser')

        header = soup.find('span', {'class': "header-profile-login"})
        if not header:
            raise Parser.IncorrectFormat(data)
        name = header.text.strip()

        links = soup.find_all('a')
        if not links:
            raise Parser.IncorrectFormat(data)

        # The first link whose whole text is "Книги <number>" gives the book
        # count; it stays 0 when no link matches.
        book_matches = (
            re.fullmatch(r"Книги\s*(\d+)\s*", link.text) for link in links)
        num_books = next(
            (int(found.group(1)) for found in book_matches if found), 0)

        rows = soup.find_all('div', {'class': "group-row-title"})
        if not rows:
            raise Parser.IncorrectFormat(data)

        birth = death = None
        for row in rows:
            row_text = row.text
            if not birth:
                found = re.fullmatch(
                    r"(?:Родился|Родилась):\s*(.*)", row_text)
                if found:
                    birth = found.group(1)
            if not death:
                found = re.fullmatch(r"(?:Умер|Умерла):\s*(.*)", row_text)
                if found:
                    death = found.group(1)
            if birth and death:
                break

        # NOTE(review): birth/death may still be None here; self.parseDate is
        # presumably able to handle that - confirm.
        birth_date, birth_place = self.parseDate(birth)
        birth_place = re.sub(r'\s+', ' ', birth_place).strip()
        death_date, death_place = self.parseDate(death)
        death_place = re.sub(r'\s+', ' ', death_place).strip()

        fans = soup.find('span', {
                'class': "stats-item marg-right",
                'title': 'Почитатели творчества'
            })
        adepts = 0 if fans is None else int(fans.text)

        audience = soup.find('span', {
                'class': "stats-item marg-right",
                'title': 'Читателей'
            })
        readers = 0 if audience is None else int(audience.text)

        return [
            name, birth_date, birth_place, death_date,
            death_place, num_books, adepts, readers]
class Scraper(object):
    """Fetch paginated listing pages, parse their rows and export them.

    Configuration is read from ``config.yaml`` in the current working
    directory; parsed rows from all pages are passed to ``CsvMaker.make``.
    """

    def __init__(self):
        self.__load_config()
        self.parser = Parser()
        self.csv_maker = CsvMaker()
        # Page currently being scraped; read by __query_dict().
        self.page_number = 1

    def scrape(self):
        """Scrape every configured page and hand all rows to the CSV maker."""
        data = []
        # NOTE(review): range() excludes its stop value, so the page numbered
        # config[PAGES] itself is never requested - confirm this is intended.
        for page_no in range(1, self.config[PAGES]):
            self.page_number = page_no
            # Extend in place instead of re-copying the list on every page.
            data.extend(self.scrape_page())
        self.csv_maker.make(data)

    def scrape_page(self):
        """Download the current page and return its parsed rows."""
        # Single-argument print(...) prints the same on Python 2 and 3.
        print("Scraping Page No: {}".format(self.page_number))
        resp = requests.get(self.__url_endpoint(), self.__query_dict())
        soup = BeautifulSoup(resp.text, 'html.parser')
        table = soup.findAll(True, {'class': ['row0', 'row1']})
        return self.parser.parse(table)

    def __load_config(self):
        """Load ``config.yaml`` into ``self.config``, closing the file."""
        with open('config.yaml') as config_file:
            self.config = yaml.safe_load(config_file)

    def __url_endpoint(self):
        """Return the base URL configured for the listing."""
        return self.config[BASE_URL]

    def __query_dict(self):
        """Build the query parameters for the current page.

        Adds the ``q`` filter only when a course is present in the config.
        """
        query_dict = {'pp': PER_PAGE_DATA, 'p': self.page_number}
        # ``in`` replaces dict.has_key(), which no longer exists on Python 3.
        if COURSE in self.config:
            query_dict['q'] = self.config[COURSE]
        print(query_dict)
        return query_dict
예제 #6
0
def parse(city, country="France"):
    """Parse the page registered for ``city`` in ``country``.

    Looks the URL up in the module-level ``toParse`` mapping
    (country -> city -> url) and delegates to ``Parser.parse``.

    :param city: city name, a key of ``toParse[country]``
    :param country: country name, a key of ``toParse``
    :return: the result of ``Parser.parse(url, city, country)``, or ``None``
        when the country or the city is not supported (an error message is
        written to stderr in that case)
    """
    cities = toParse.get(country)
    if not cities:
        print(f"No such country as {country} is supported", file=sys.stderr)
        # Stop here: continuing would call .get() on a missing mapping.
        return None

    url = cities.get(city)
    if not url:
        print(f"No such city as {city} is supported", file=sys.stderr)
        # Stop here: there is no URL to hand to the parser.
        return None

    return Parser.parse(url, city, country)
예제 #7
0
def main():
    """Compile the file named by the command-line arguments.

    Opens the file returned by ``process_arguments()``, runs the parser's
    ``P()`` entry point over a ``Lexer`` on it, and prints a message for
    end-of-file, syntax and lexical errors instead of propagating them.
    """
    filename = process_arguments()

    with open(filename) as filebuffer:
        try:
            lex = Lexer(filebuffer)
            parser = Parser(lex)
            parser.P()
        except EndOfFileError:
            # NOTE(review): if Lexer(...) itself raises EndOfFileError, `lex`
            # is unbound here - confirm the constructor cannot raise it.
            # Single-argument print(...) prints the same on Python 2 and 3.
            print("Syntax error at line " + str(lex.line))
        except (CompilerSyntaxError, CompilerLexError) as e:
            print(e)
예제 #8
0
 def test_parse(self):
     """Constructing a Parser on a bad folder must not yield an instance.

     A ``SystemExit`` raised by the constructor is tolerated; in every case
     the local must still be ``None`` afterwards.
     """
     created = None
     try:
         created = Parser(files_folder="..\\wrong_path\\to\\files_folder")
     except SystemExit:
         # Expected way for the constructor to bail out.
         pass
     finally:
         self.assertEqual(created, None)
예제 #9
0
파일: test.py 프로젝트: TimLang/yue_parser
def test_parsing_return_statement():
    """A single ``return`` statement is parsed into the expected literal."""
    source_code = """
        return a;
    """

    program = Parser(Lexer(source_code)).exec_program()
    first_statement = program.statements[0]

    assert first_statement._token_iteral == "Return with a"
예제 #10
0
파일: test.py 프로젝트: TimLang/yue_parser
def test_parsing_infix_expression():
    """Parse an infix expression and print the first statement's literal.

    Makes no assertion; the output is for manual inspection only.
    """
    source_code = """
    1-2+3;
    """
    program = Parser(Lexer(source_code)).exec_program()

    separator = "===========\n"
    print(separator)
    print(program.statements[0]._token_iteral)
    print(separator)
예제 #11
0
파일: test.py 프로젝트: TimLang/yue_parser
def test_parsing_let_statement():
    """Two ``let`` statements yield two statements with the expected text."""
    source_code = """
        let a = 122;
        let b = 1;
    """
    lexer = Lexer(source_code)
    program = Parser(lexer).exec_program()

    # Dump the token stream to help debugging when the test fails.
    for token in lexer.tokens:
        print(token)

    first_literal = program.statements[0].token_iteral
    print(first_literal)

    assert len(program._statements) == 2
    expected = ("This is a Let statement, left is an identifer: a,  "
                "right size is value of 122")
    assert first_literal == expected
예제 #12
0
    def parse(self, data):
        """
        Collect the author links found on an HTML listing page.

        :param data: html text (page)
        :return: list with the ``href`` value of every author link
        :raises Parser.IncorrectFormat: when the page has no author link
        """
        # Author links look like:
        # <a class="arow-name c-black" href="/author/30230">...</a>
        author_links = BeautifulSoup(data, 'html.parser').find_all(
            'a', {'class': 'arow-name c-black'})
        if not author_links:
            raise Parser.IncorrectFormat(data)

        hrefs = []
        for link in author_links:
            hrefs.append(link.get('href'))
        return hrefs
예제 #13
0
class TestOrmMysql:
    """Insert parsed access-log results through the ORM and count the rows.

    Each test expects the number of stored rows to equal the number of
    parsed entries, capped at 10.
    """

    @pytest.fixture(scope='function', autouse=True)
    def setup(self, orm_client):
        """Create the builder and parser and parse ``access.log`` once per test."""
        self.bd = orm_client
        self.builder = OrmBuilder(orm_client)
        self.parser = Parser()
        parsed = self.parser.parse_logs(
            log_path='access.log', result='access.log', save_bd=True)
        self.biggest_request, self.client_error, self.server_error = parsed

    def test_biggest_request_insert(self):
        """Biggest requests are stored; at most 10 rows are expected."""
        for entry in self.biggest_request:
            fields = entry[1].split()
            self.builder.add_biggest_request(fields[0], fields[2], entry[0])

        stored = self.bd.session.query(BiggestRequest).all()
        assert len(stored) == min(len(self.biggest_request), 10)

    def test_client_error_insert(self):
        """Client errors are stored; at most 10 rows are expected."""
        for entry in self.client_error:
            parts = entry[0].split(sep=':')
            self.builder.add_client_error(parts[0], int(parts[1]), entry[1])

        stored = self.bd.session.query(ClientError).all()
        assert len(stored) == min(len(self.client_error), 10)

    def test_server_error_insert(self):
        """Server errors are stored; at most 10 rows are expected."""
        for entry in self.server_error:
            parts = entry[0].split(sep=':')
            self.builder.add_server_error(parts[0], int(parts[1]), entry[1])

        stored = self.bd.session.query(ServerError).all()
        assert len(stored) == min(len(self.server_error), 10)
예제 #14
0
def parse_and_write_bank_statement(
        parser: Parser,
        src_file: Path,
        dest_file: Path,
        rules_dir: Optional[Path],
        import_transaction: ImportTransactionProtocol,
        force: bool,
        dry_run: bool) -> bool:
    """Parse one bank statement and write it out as a ledger file.

    :param parser: parser already bound to the statement to import
    :param src_file: original statement file
    :param dest_file: ledger file to create
    :param rules_dir: passed through to ``parser.parse``
    :param import_transaction: receives the written file and moves the
        source file next to it (same stem, original suffix)
    :param force: overwrite ``dest_file`` if it already exists
    :param dry_run: print the ledger to stdout instead of writing the file
    :return: ``False`` when the import is skipped (destination exists
        without ``force``, or the parser raises ``NotImplementedError``),
        ``True`` otherwise
    """
    if dest_file.exists():
        if not force:
            print(f'WARNING: skipping import of already imported {src_file}',
                  file=sys.stderr)
            return False
        print(f'WARNING: existing {dest_file} will be overwritten',
              file=sys.stderr)

    try:
        bank_statement = parser.parse(rules_dir=rules_dir)
    except NotImplementedError as exc:
        print(f'Warning: couldn\'t parse {src_file}:', exc.args,
              file=sys.stderr)
        return False

    if dry_run:
        with io.StringIO() as buffer:
            bank_statement.write_ledger(buffer)
            print(buffer.getvalue())
    else:
        try:
            with open(dest_file, 'w') as ledger:
                bank_statement.write_ledger(ledger)
        except Exception as exc:
            # Remove the partially written ledger file so the import can be
            # repeated cleanly once the cause of the error is fixed.
            try:
                dest_file.unlink()
            except FileNotFoundError:
                pass
            raise exc

    # NOTE(review): these also run on a dry run - presumably
    # import_transaction handles that itself; confirm.
    import_transaction.add_file(dest_file)
    archived_src = dest_file.with_suffix(src_file.suffix)
    import_transaction.move_file_to_annex(src_file, archived_src)
    return True
예제 #15
0
    def prepare_capability_matrix(self):
        """Build ``self.capability_matrix`` from all direct Parser subclasses.

        The matrix maps type -> source -> list of parser instances. Every
        subclass is instantiated and initialised once, then registered under
        its reported type for each entry of its ``feeders_list``.
        """
        self.capability_matrix = {
            TYPE_LOG_EVENT: {},
            TYPE_FS_CHANGE: {},
            TYPE_NETWORK_PACKET: {},
        }

        for parser_cls in Parser.__subclasses__():
            instance = parser_cls()
            instance.init()
            capabilities = instance.get_capabilities()

            for source in capabilities['feeders_list']:
                # Looked up inside the loop so the type is only resolved
                # when there is at least one feeder.
                per_source = self.capability_matrix[capabilities['type']]
                registered = per_source.setdefault(source, [])
                if instance not in registered:
                    registered.append(instance)
예제 #16
0
from generation.generators.frontend.user_generator import UserGenerator as UserModuleGenerator
from generation.generators.frontend.shopping_cart_generator import ShoppingCartGenerator as SCGenerator
from generation.generators.frontend.auth_generator import AuthGenerator
from generation.generators.frontend.home_generator import HomeGenerator
from generation.generators.frontend.starter_generator import StarterGenerator
from generation.generators.frontend.profile_generator import ProfileGenerator
from generation.generators.frontend.product_generator import ProductGenerator
from generation.generators.frontend.item_generator import ItemGenerator
from generation.generators.frontend.category_generator import CategoryGenerator as CategoryGeneratorFront

if __name__ == '__main__':
    # Start from a clean slate: remove a previous ./output tree. Any failure
    # (for instance the directory not existing yet) is deliberately ignored.
    try:
        shutil.rmtree('./output')
    except Exception:
        pass
    # Build the model from the "metamodel" directory under `root`.
    # NOTE(review): 'scala-angular.tx' and 'project.scan' are presumably the
    # grammar and the model file, and True a debug flag - confirm against
    # Parser.parse.
    parser = Parser()
    model = parser.parse(os.path.join(root, "metamodel"), 'scala-angular.tx',
                         'project.scan', True)
    # Every specialised generator is constructed around the same
    # MainGenerator instance.
    main_generator = MainGenerator()
    model_generator = ModelGenerator(main_generator)
    table_generator = TableGenerator(main_generator)
    repository_generator = RepositoryGenerator(main_generator)
    service_generator = ServiceGenerator(main_generator)
    controller_generator = ControllerGenerator(main_generator)
    dto_generator = DTOGenerator(main_generator)
    jwt_generator = JWTGenerator(main_generator)
    module_generator = ModuleGenerator(main_generator)
    conf_generator = ConfGenerator(main_generator)
    sbt_generator = SbtGenerator(main_generator)
    category_generator = CategoryGenerator(main_generator)
    order_generator = OrderGenerator(main_generator)
예제 #17
0
 def page_items(self):
     """Return a ``Parser`` for every element matching ``Locators.ITEM``."""
     matched = self.soup.select(Locators.ITEM)
     return [Parser(element) for element in matched]
 def __init__(self):
     """Load the configuration and create the parser and CSV helpers."""
     # The configuration is loaded before the collaborators are created.
     self.__load_config()
     self.parser = Parser()
     self.csv_maker = CsvMaker()
     # Page counter starts at the first page.
     self.page_number = 1
예제 #19
0
#!/usr/bin/env python

from parsers.parser import Parser
from controllers.base_controller import BaseController
from loggers.controller_logger import ControllerLogger
from loggers.request_logger import RequestLogger

# Parse the incoming request: calling the Parser instance returns the request
# object used below (it exposes `status` and `parsed_args`).
parser = Parser()
request = parser()
# TODO: Check if connection to socket has already been established.

# Log the request status to the log file given in the parsed arguments.
RequestLogger(request.status, request.parsed_args.log_file)()

# Run the controller: calling the BaseController instance returns the object
# whose `subcontroller` is reported below.
base_controller = BaseController(request.parsed_args)
controller = base_controller()

# Log the subcontroller's class name, action, status and data to the same
# log file.
controller_logger = ControllerLogger(
    controller.subcontroller.__class__.__name__,
    controller.subcontroller.action, controller.subcontroller.status,
    controller.subcontroller.data, request.parsed_args.log_file)
controller_logger()
예제 #20
0
def parseAll():
    """Lazily parse every city of every country listed in ``toParse``.

    Yields the result of ``Parser.parse(url, city, country)`` for each
    (country, city, url) entry, in mapping order.
    """
    yield from (
        Parser.parse(url, city, country)
        for country, cities in toParse.items()
        for city, url in cities.items()
    )
예제 #21
0
 def setUpClass(cls):
     """
     Create one Parser over the "example_files" folder and store it on
     ParserTest as a class variable, so the same instance is used during
     the entire test run.
     """
     ParserTest.parser = Parser(files_folder="example_files")
예제 #22
0
 def setup(self, orm_client):
     """Create the builder and parser, then parse ``access.log``.

     The three values returned by ``parse_logs`` are stored as
     ``biggest_request``, ``client_error`` and ``server_error``.
     """
     self.bd = orm_client
     self.builder = OrmBuilder(orm_client)
     self.parser = Parser()
     parsed = self.parser.parse_logs(
         log_path='access.log', result='access.log', save_bd=True)
     self.biggest_request, self.client_error, self.server_error = parsed
예제 #23
0
파일: indexer.py 프로젝트: nemanja97/Query
 def index_data(self):
     """Index all files under ``self.root`` and print the elapsed seconds.

     The timing includes the construction of the ``Parser``.
     """
     started = time()
     file_parser = Parser()
     self.get_all_files(self.root, file_parser)
     elapsed = time() - started
     print(elapsed)