示例#1
0
from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text
from sqlalchemy.schema import CreateSchema
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm.exc import NoResultFound
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.exc import ProgrammingError, IntegrityError
from scraper_lib import raw_config

Base = declarative_base()

SCHEMA = "it-ebooks"
# Used when schema cannot be used
table_prefix = ""

if not raw_config.get("database", "uri").startswith("postgres"):
    # Any database URI that is not postgres gets no schema; the schema name is
    # folded into the table names as a prefix instead. The prefix must be
    # built BEFORE SCHEMA is cleared, otherwise `None + "_"` raises TypeError.
    table_prefix = SCHEMA + "_"
    SCHEMA = None


class Book(Base):
    """Declarative ORM model for one book row.

    The table is named ``table_prefix + "book"`` and is placed in the schema
    given by the module-level ``SCHEMA`` value.
    """
    __tablename__ = table_prefix + "book"
    __table_args__ = {"schema": SCHEMA}
    # Surrogate primary key, auto-incrementing
    id = Column(Integer, primary_key=True, autoincrement=True)
    # Unique per row; presumably the source site's own id for the book - TODO confirm
    book_id = Column(Integer, unique=True)
    # NOTE(review): the *_location / *_source pairs look like "where the file
    # was saved" vs. "where it was downloaded from" - confirm against the scraper
    file_location = Column(String(300))
    file_cover_location = Column(String(300))
    file_cover_source = Column(String(200))
    # Unbounded text
    description = Column(Text)
    file_source = Column(String(200))
    # Short string, at most 10 chars (shadows the builtin `format` only inside
    # this class body)
    format = Column(String(10))
    # Stored as a string of at most 20 chars, not as a number
    isbn = Column(String(20))
示例#2
0
from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text
from sqlalchemy.schema import CreateSchema
from sqlalchemy.orm import sessionmaker, scoped_session
from sqlalchemy.orm.exc import NoResultFound
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.exc import ProgrammingError, IntegrityError
from scraper_lib import raw_config

Base = declarative_base()

SCHEMA = 'smbc'

if raw_config.get('database', 'uri').startswith('postgres'):
    # Postgres URI: keep the schema, so table names need no prefix
    table_prefix = ''
else:
    # Schema cannot be used: turn the schema name into a table-name prefix
    # and drop the schema. The right-hand side is evaluated in full before
    # either name is rebound, so the prefix still sees the original SCHEMA.
    table_prefix, SCHEMA = SCHEMA + '_', None


class Comic(Base):
    """Declarative ORM model for one comic row.

    The table is named ``table_prefix + 'comic'`` and is placed in the schema
    given by the module-level ``SCHEMA`` value.
    """
    __tablename__ = table_prefix + 'comic'
    __table_args__ = {'schema': SCHEMA}
    # Surrogate primary key, auto-incrementing
    id = Column(Integer, primary_key=True, autoincrement=True)
    # NOTE(review): presumably when the scraper fetched the comic vs. when the
    # site published it - confirm; timezone handling is not visible here
    time_collected = Column(DateTime)
    posted_at = Column(DateTime)
    # Stored as a string (up to 256 chars), not an integer
    comic_id = Column(String(256))
    # Unbounded text; presumably the image alt/hover text - TODO confirm
    alt = Column(Text)
    # Unbounded text; presumably text extracted from the image via OCR - TODO confirm
    ocr = Column(Text)
    title = Column(String(1024))
    # Single path, up to 512 chars
    file_path = Column(String(512))
from sqlalchemy import create_engine, Column, Integer, String, DateTime, Date, Text
from sqlalchemy.schema import CreateSchema
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm.exc import NoResultFound
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.exc import ProgrammingError, IntegrityError
from scraper_lib import raw_config

Base = declarative_base()

SCHEMA = 'existentialcomics'
# Used when schema cannot be used
table_prefix = ''

if not raw_config.get('database', 'uri').startswith('postgres'):
    # Any database URI that is not postgres gets no schema; the schema name is
    # folded into the table names as a prefix instead. The prefix must be
    # built BEFORE SCHEMA is cleared, otherwise `None + '_'` raises TypeError.
    table_prefix = SCHEMA + '_'
    SCHEMA = None


class Comic(Base):
    """Declarative ORM model for one comic row.

    The table is named ``table_prefix + 'comic'`` and is placed in the schema
    given by the module-level ``SCHEMA`` value.
    """
    __tablename__ = table_prefix + 'comic'
    __table_args__ = {'schema': SCHEMA}
    # Surrogate primary key, auto-incrementing
    id = Column(Integer, primary_key=True, autoincrement=True)
    # NOTE(review): presumably when the scraper fetched the comic - confirm
    time_collected = Column(DateTime)
    # Integer id; no unique constraint is declared on it
    comic_id = Column(Integer)
    title = Column(String(1024))
    philosophers = Column(String(2048))  # Will be delimited by `,`
    # NOTE(review): presumably the number of entries in `philosophers` - confirm
    num_philosophers = Column(Integer)
    # Unbounded text; presumably the image alt/hover text - TODO confirm
    alt = Column(Text)
    explanation = Column(Text)  # Will be delimited by `|`
    # Plural: appears to hold several paths in one text field; the delimiter
    # is not visible here - TODO confirm
    file_paths = Column(Text)
示例#4
0
    def get_new_proxy(self, iso_country_code='US'):
        """
        Request proxies from the proxicity API until one passes a live connectivity check.

        If the config has no 'proxicity' section, or 'proxicity_enabled' is not true for
        this scraper, no request is made and a dict whose values are all None is returned.

        There is no retry limit: the method keeps asking for a new proxy until one works.
        An error raised while contacting the proxicity API itself is not caught here.

        :param iso_country_code: String - 2 char country code, case-insensitive, ISO 3166 standard.
                                 None or 'ANY' is treated as 'US'. NOTE: the country filter is
                                 currently disabled in the request URL, so this value does not
                                 affect which proxy is returned.
        :return: a dict with the parts 'protocol', 'ip', 'port', 'ipport'; a proxy fetched
                 from the API additionally carries 'curl' and 'country'
        """
        selected_proxy = {
            'protocol': None,
            'ip': None,
            'port': None,
            'ipport': None,
        }
        proxicity_enabled = raw_config.getboolean(SCRAPER_NAME,
                                                  'proxicity_enabled')
        # A proxy can only be fetched when the feature is switched on AND the
        # 'proxicity' section (which holds the api key) exists, so bail out
        # when EITHER requirement is missing.
        if ('proxicity' not in raw_config.sections()
                or proxicity_enabled is not True):
            return selected_proxy

        if iso_country_code is None or iso_country_code.upper() == 'ANY':
            iso_country_code = 'US'

        iso_country_code = iso_country_code.upper()

        # `country` is still passed to format() so the filter can be re-enabled
        # by un-commenting the line below; str.format ignores unused keywords.
        proxy_source = (
            'https://www.proxicity.io/api/v1/{apikey}/proxy?format=json'
            '&protocol=http'
            # '&country={country}'  # Disabled for now
            '&refererSupport=true'
            '&userAgentSupport=true'
            '&httpsSupport=true'
            '&isAnonymous=true').format(apikey=raw_config.get(
                'proxicity', 'apikey'),
                                        country=iso_country_code)
        while True:
            logger.info("Getting new proxy...")
            response = requests.get(proxy_source, timeout=30)

            if response.status_code == requests.codes.ok:
                json_data = response.json()
                selected_proxy['protocol'] = json_data.get('protocol')
                selected_proxy['ip'] = json_data.get('ip')
                selected_proxy['port'] = json_data.get('port')
                selected_proxy['ipport'] = json_data.get('ipPort')
                selected_proxy['curl'] = json_data.get('curl')
                selected_proxy['country'] = json_data.get('country')

                # Check proxy
                try:
                    logger.info("Test proxy server")
                    test_url = 'https://lumtest.com/myip.json'
                    proxy_url = selected_proxy['curl']
                    # Without a timeout a dead proxy could block this call
                    # (and therefore the whole loop) indefinitely.
                    response_test = requests.get(test_url,
                                                 proxies={
                                                     'http': proxy_url,
                                                     'https': proxy_url,
                                                 },
                                                 timeout=30)
                    if response_test.status_code == requests.codes.ok:
                        break
                except Exception as e:
                    # Best effort: any failure just means this proxy is
                    # unusable, but record why instead of hiding it.
                    logger.warning(
                        "Proxy test failed: {error}".format(error=e))
            else:
                # An error response is not guaranteed to carry a JSON body;
                # fall back to the raw text so logging cannot crash the loop.
                try:
                    error_body = response.json()
                except ValueError:
                    error_body = response.text
                logger.error(
                    "Bad response from server while getting a proxy: {status_code}-{json}"
                    .format(status_code=response.status_code,
                            json=error_body))

            logger.info("Bad proxy, try again")

        return selected_proxy