from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text
from sqlalchemy.schema import CreateSchema
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm.exc import NoResultFound
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.exc import ProgrammingError, IntegrityError

from scraper_lib import raw_config

Base = declarative_base()

SCHEMA = "it-ebooks"
# Used when schema cannot be used
table_prefix = ""

if not raw_config.get("database", "uri").startswith("postgres"):
    # The database URI is not a postgres one: fall back to prefixing the
    # table name instead of using a schema. The prefix must be derived
    # BEFORE the schema name is cleared, otherwise this would evaluate
    # `None + "_"` and raise a TypeError at import time.
    table_prefix = SCHEMA + "_"
    SCHEMA = None


class Book(Base):
    """ORM model for one scraped book record.

    The table lives in the `SCHEMA` schema when the configured database URI
    starts with "postgres"; otherwise the schema is None and the table name
    carries `table_prefix` instead.
    """
    __tablename__ = table_prefix + "book"
    __table_args__ = {"schema": SCHEMA}

    # Surrogate primary key generated by the database
    id = Column(Integer, primary_key=True, autoincrement=True)
    # Unique integer identifier of the book (presumably the id used by the
    # scraped site -- confirm against the scraper)
    book_id = Column(Integer, unique=True)
    file_location = Column(String(300))
    file_cover_location = Column(String(300))
    file_cover_source = Column(String(200))
    description = Column(Text)
    file_source = Column(String(200))
    format = Column(String(10))
    isbn = Column(String(20))
from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text
from sqlalchemy.schema import CreateSchema
from sqlalchemy.orm import sessionmaker, scoped_session
from sqlalchemy.orm.exc import NoResultFound
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.exc import ProgrammingError, IntegrityError

from scraper_lib import raw_config

Base = declarative_base()

# Name shared by the schema and by the fallback table prefix
_SCHEMA_NAME = 'smbc'

# A schema is only used when the configured database URI is a postgres one
_uses_schema = raw_config.get('database', 'uri').startswith('postgres')

# With a schema: SCHEMA is set and no prefix is needed.
# Without one: SCHEMA is None and the table name is prefixed instead.
SCHEMA = _SCHEMA_NAME if _uses_schema else None
table_prefix = '' if _uses_schema else _SCHEMA_NAME + '_'


class Comic(Base):
    """ORM model for one scraped comic.

    Stored in the `SCHEMA` schema when one is in use, otherwise in a table
    whose name starts with `table_prefix`.
    """
    __tablename__ = table_prefix + 'comic'
    __table_args__ = {'schema': SCHEMA}

    # Surrogate primary key generated by the database
    id = Column(Integer, primary_key=True, autoincrement=True)

    # Timestamps
    time_collected = Column(DateTime)
    posted_at = Column(DateTime)

    # Comic content and metadata
    comic_id = Column(String(256))
    alt = Column(Text)
    ocr = Column(Text)
    title = Column(String(1024))
    file_path = Column(String(512))
from sqlalchemy import create_engine, Column, Integer, String, DateTime, Date, Text
from sqlalchemy.schema import CreateSchema
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm.exc import NoResultFound
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.exc import ProgrammingError, IntegrityError

from scraper_lib import raw_config

Base = declarative_base()

SCHEMA = 'existentialcomics'
# Used when schema cannot be used
table_prefix = ''

if not raw_config.get('database', 'uri').startswith('postgres'):
    # The database URI is not a postgres one: fall back to prefixing the
    # table name instead of using a schema. The prefix must be derived
    # BEFORE the schema name is cleared, otherwise this would evaluate
    # `None + '_'` and raise a TypeError at import time.
    table_prefix = SCHEMA + '_'
    SCHEMA = None


class Comic(Base):
    """ORM model for one scraped comic.

    The table lives in the `SCHEMA` schema when the configured database URI
    starts with 'postgres'; otherwise the schema is None and the table name
    carries `table_prefix` instead.
    """
    __tablename__ = table_prefix + 'comic'
    __table_args__ = {'schema': SCHEMA}

    # Surrogate primary key generated by the database
    id = Column(Integer, primary_key=True, autoincrement=True)
    time_collected = Column(DateTime)
    comic_id = Column(Integer)
    title = Column(String(1024))
    # Will be delimited by `,`
    philosophers = Column(String(2048))
    num_philosophers = Column(Integer)
    alt = Column(Text)
    explanation = Column(Text)
    # Will be delimited by `|` (presumably one entry per stored file --
    # confirm against the code that writes this column)
    file_paths = Column(Text)
def get_new_proxy(self, iso_country_code='US'):
    """
    Request proxies from the proxicity.io API until one passes a test request.

    When there is no `proxicity` config section, or `proxicity_enabled` is not
    true for this scraper, no request is made and a dict whose values are all
    None is returned.

    NOTE: the country filter is currently disabled in the API query, so
    `iso_country_code` is only normalised (None / 'ANY' become 'US') and has
    no effect on which proxy is returned.

    :param iso_country_code: String - 2 char country code, case-insensitive,
                             ISO 3166 standard
    :return: a dict with the parts 'protocol', 'ip', 'port', 'ipport'; when a
             proxy was fetched it also carries 'curl' and 'country'
    """
    selected_proxy = {
        'protocol': None,
        'ip': None,
        'port': None,
        'ipport': None,
    }

    proxicity_enabled = raw_config.getboolean(SCRAPER_NAME, 'proxicity_enabled')
    # Proxicity is only usable when it is BOTH configured and enabled. The
    # check must be `or`: with `and`, a disabled scraper would still fetch
    # proxies and a missing section would crash on the apikey lookup below.
    if ('proxicity' not in raw_config.sections()
            or proxicity_enabled is not True):
        return selected_proxy

    if iso_country_code is None or iso_country_code.upper() == 'ANY':
        iso_country_code = 'US'
    iso_country_code = iso_country_code.upper()

    # `country` is still passed to format() so the filter can be re-enabled by
    # restoring the commented-out query part; format() ignores it until then.
    proxy_source = (
        'https://www.proxicity.io/api/v1/{apikey}/proxy?format=json'
        '&protocol=http'
        # '&country={country}'  # Disabled for now
        '&refererSupport=true'
        '&userAgentSupport=true'
        '&httpsSupport=true'
        '&isAnonymous=true').format(
            apikey=raw_config.get('proxicity', 'apikey'),
            country=iso_country_code)

    test_url = 'https://lumtest.com/myip.json'

    while True:
        logger.info("Getting new proxy...")
        response = requests.get(proxy_source, timeout=30)

        if response.status_code == requests.codes.ok:
            json_data = response.json()
            selected_proxy['protocol'] = json_data.get('protocol')
            selected_proxy['ip'] = json_data.get('ip')
            selected_proxy['port'] = json_data.get('port')
            selected_proxy['ipport'] = json_data.get('ipPort')
            selected_proxy['curl'] = json_data.get('curl')
            selected_proxy['country'] = json_data.get('country')

            # Check proxy
            try:
                logger.info("Test proxy server")
                # Without a timeout a dead proxy would block this call
                # forever and the retry loop would never advance.
                response_test = requests.get(
                    test_url,
                    proxies={
                        'http': selected_proxy['curl'],
                        'https': selected_proxy['curl'],
                    },
                    timeout=30)
                if response_test.status_code == requests.codes.ok:
                    break
            except Exception as error:
                # Best effort: any failure just means this proxy is unusable,
                # but record why instead of discarding the error silently.
                logger.warning("Proxy test failed: {error}".format(error=error))
        else:
            # Log the raw body: an error response is not guaranteed to be
            # JSON, and response.json() would raise while reporting the error.
            logger.error(
                "Bad response from server while getting a proxy: {status_code}-{json}"
                .format(status_code=response.status_code, json=response.text))

        logger.info("Bad proxy, try again")

    return selected_proxy