class MpSiteKeys(CommonBase):
    __tablename__ = 'mp_site_keys'

    rid = Column(BigInteger, primary_key=True, nullable=False, autoincrement=True)
    pubKey = Column(MEDIUMTEXT(), nullable=False)
    pubKeyHash = Column(String(255), nullable=False)
    priKey = Column(MEDIUMTEXT(), nullable=False)
    priKeyHash = Column(String(255), nullable=False)
    active = Column(Integer, nullable=True, server_default='1')
    request_new_key = Column(Integer, nullable=True, server_default='0')
    mdate = Column(DateTime, nullable=True, server_default='1970-01-01 00:00:00')
class MPAgentRegistration(CommonBase):
    __tablename__ = 'mp_agent_registration'

    rid = Column(BigInteger, primary_key=True, autoincrement=True, info='rid')
    cuuid = Column(String(50), nullable=False, info='Client ID')
    enabled = Column(Integer, server_default='0', info='Enabled')
    clientKey = Column(String(100), server_default='NA')
    pubKeyPem = Column(MEDIUMTEXT())
    pubKeyPemHash = Column(MEDIUMTEXT())
    hostname = Column(String(255), nullable=False, info='Hostname')
    serialno = Column(String(255), nullable=False, info='Serial No')
    reg_date = Column(DateTime, nullable=False, server_default='1970-01-01 00:00:00', info='Reg Date')
class ExampleTable(Base):
    __tablename__ = "{{ cookiecutter.plugin_name }}_example"

    example_id = Column(Unicode(64), primary_key=True)
    example_name = Column(Unicode(120))
    example_desc = Column(MEDIUMTEXT(collation="utf8mb4_unicode_ci"))
    example_type = Column(INTEGER)
    example_url = Column(MEDIUMTEXT(collation="utf8mb4_unicode_ci"))
    example_file = Column(Unicode(64))
    example_mimetype = Column(Unicode(120))
    example_owner = Column(ForeignKey("fsuser.user_id"), nullable=False, index=True)
    extras = Column(MEDIUMTEXT(collation="utf8mb4_unicode_ci"))
    tags = Column(MEDIUMTEXT(collation="utf8mb4_unicode_ci"))

    fsuser = relationship("User")
def get_string_type(col_type, params):
    """Create a string type column.

    Args:
        col_type (string): Type of the column.
        params (object): Additional parameters.

    Returns:
        sqlalchemy.types.TypeEngine: String type like char or text.
    """
    if col_type == 'char':
        return CHAR(params.get('length'))
    elif col_type == 'json':
        return (
            JSON(none_as_null=True)
            .with_variant(JSONB(none_as_null=True), 'postgresql')
            .with_variant(Text(), 'sqlite')
        )
    elif col_type == 'long_text':
        return LONGTEXT().with_variant(Text(), 'sqlite')
    elif col_type == 'medium_text':
        return MEDIUMTEXT().with_variant(Text(), 'sqlite')
    elif col_type == 'string':
        return String(length=params.get('length'))
    elif col_type == 'text':
        return Text()
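# A minimal usage sketch (an assumption, not part of the original source):
# declaring columns through get_string_type() above. The table and column
# names are hypothetical; the type mappings follow the function body.
from sqlalchemy import Column, MetaData, Table

metadata = MetaData()
documents = Table(
    'documents', metadata,
    Column('code', get_string_type('char', {'length': 8})),       # CHAR(8)
    Column('title', get_string_type('string', {'length': 255})),  # VARCHAR(255)
    Column('body', get_string_type('medium_text', {})),           # MEDIUMTEXT on MySQL, TEXT on SQLite
)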
class MpOsProfilesCriteria(CommonBase):
    __tablename__ = 'mp_os_config_profiles_criteria'

    rid = Column(BigInteger, primary_key=True, autoincrement=True)
    gPolicyID = Column(String(50), nullable=False)
    type = Column(String(25))
    type_data = Column(MEDIUMTEXT())
    type_action = Column(INTEGER(1, unsigned=True), server_default='0')
    type_order = Column(INTEGER(2, unsigned=True), server_default='0')
class AdHocReports(CommonBase):
    __tablename__ = 'mp_adhoc_reports'

    rid = Column(BigInteger, primary_key=True, autoincrement=True)
    name = Column(String(255), nullable=False)
    reportData = Column(MEDIUMTEXT())
    owner = Column(String(255), nullable=False)
    rights = Column(INTEGER(1, unsigned=True), server_default='0')
    disabled = Column(INTEGER(1, unsigned=True), server_default='0')
    disabledDate = Column(DateTime, server_default='1970-01-01 00:00:00')
class OSMigrationStatus(CommonBase):
    __tablename__ = 'mp_os_migration_status'

    rid = Column(BigInteger, primary_key=True, autoincrement=True)
    cuuid = Column(String(50),
                   ForeignKey('mp_clients.cuuid', ondelete='CASCADE', onupdate='NO ACTION'),
                   nullable=False, index=True, unique=True)
    startDateTime = Column(DateTime, server_default='1970-01-01 00:00:00')
    stopDateTime = Column(DateTime, server_default='1970-01-01 00:00:00')
    preOSVer = Column(String(255), nullable=False)
    postOSVer = Column(String(255))
    label = Column(MEDIUMTEXT())
    migrationID = Column(String(100), nullable=False)
class ApplePatchCriteria(CommonBase):
    __tablename__ = 'mp_apple_patch_criteria'

    rid = Column(BigInteger, primary_key=True, autoincrement=True)
    puuid = Column(String(50), nullable=False, server_default='1')
    supatchname = Column(String(255), nullable=True)
    type = Column(String(25))
    type_data = Column(MEDIUMTEXT())
    type_action = Column(INTEGER(1, unsigned=True), server_default='0')
    type_order = Column(INTEGER(2, unsigned=True), server_default='0')
    cdate = Column(DateTime, server_default='1970-01-01 00:00:00')
    mdate = Column(DateTime, server_default='1970-01-01 00:00:00')
class Question(Base):
    __tablename__ = 'zhihu_questions'

    id = Column(INTEGER(), primary_key=True)
    url = Column(VARCHAR(45))
    title = Column(NVARCHAR(100))
    content = Column(MEDIUMTEXT(), nullable=True)
    topic = Column(NVARCHAR(200))
    answers_num = Column(INTEGER())
    follower = Column(INTEGER())
    watcher = Column(INTEGER())
    crawl_time = Column(DATETIME())
class Answer(Base):
    __tablename__ = 'zhihu_answers'

    q_id = Column(INTEGER(), ForeignKey('zhihu_questions.id'))
    question = relationship('Question', backref='answer')
    answer_id = Column(INTEGER(), primary_key=True)
    author_id = Column(VARCHAR(100))
    author_name = Column(NVARCHAR(20))
    author_is_advertiser = Column(BOOLEAN())
    created_time = Column(DATETIME())
    updated_time = Column(DATETIME())
    voteup_num = Column(INTEGER())
    comment_num = Column(INTEGER())
    content = Column(MEDIUMTEXT())
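# A hedged usage sketch (assumed, not in the original): creating the two
# zhihu tables above and inserting a row. The connection URL, driver
# (pymysql), and sample values are hypothetical.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine('mysql+pymysql://user:pass@localhost/zhihu?charset=utf8mb4')
Base.metadata.create_all(engine)  # emits CREATE TABLE with MEDIUMTEXT for `content`

Session = sessionmaker(bind=engine)
session = Session()
session.add(Question(id=1, title='Example question', content='A very long body...'))
session.commit()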
class Messages(Base):
    __tablename__ = 'messages'
    __table_args__ = {'mysql_engine': 'InnoDB', 'mysql_charset': 'utf8mb4'}

    message_id = Column(VARCHAR(255), primary_key=True)
    mailing_list_url = Column(VARCHAR(255),
                              ForeignKey('mailing_lists.mailing_list_url',
                                         onupdate='CASCADE', ondelete='CASCADE'),
                              primary_key=True)
    mailing_list = Column(VARCHAR(255))
    first_date = Column(DateTime)
    first_date_tz = Column(NUMERIC(11))
    arrival_date = Column(DateTime)
    arrival_date_tz = Column(NUMERIC(11))
    subject = Column(VARCHAR(1024))
    message_body = Column(MEDIUMTEXT())
    is_response_of = Column(VARCHAR(255), index=True)
    mail_path = Column(TEXT)

    def __repr__(self):
        return (u"<Messages(message_id='{0}', "
                u"mailing_list_url='{1}', "
                u"mailing_list='{2}', "
                u"first_date='{3}', first_date_tz='{4}', "
                u"arrival_date='{5}', arrival_date_tz='{6}', "
                u"subject='{7}', message_body='{8}', "
                u"is_response_of='{9}', "
                u"mail_path='{10}')>").format(self.message_id, self.mailing_list_url,
                                              self.mailing_list, self.first_date,
                                              self.first_date_tz, self.arrival_date,
                                              self.arrival_date_tz, self.subject,
                                              self.message_body, self.is_response_of,
                                              self.mail_path)
def MediumText() -> Variant:  # pylint:disable=invalid-name
    return Text().with_variant(MEDIUMTEXT(), "mysql")
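# A hedged usage sketch (assumed, not from the source): a column declared with
# MediumText() compiles to MEDIUMTEXT on MySQL and to plain TEXT on other
# backends. The Article model below is hypothetical.
from sqlalchemy import Column, Integer
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class Article(Base):
    __tablename__ = 'articles'

    id = Column(Integer, primary_key=True)
    body = Column(MediumText(), nullable=False)  # MEDIUMTEXT on MySQL, TEXT elsewhere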
class Scraper(object):
    GKG_url = 'http://data.gdeltproject.org/gdeltv2/{}.gkg.csv.zip'
    Mentions_url = 'http://data.gdeltproject.org/gdeltv2/{}.mentions.CSV.zip'
    Events_url = 'http://data.gdeltproject.org/gdeltv2/{}.export.CSV.zip'

    colnames_gkg = pd.read_csv(
        'C:/Users/605453/Documents/Projects/Firesail/Save Our Jobs/Part 2/Archive/Headers/schema_csvs/GDELT_2.0_gdeltKnowledgeGraph_Column_Labels_Header_Row_Sep2016.tsv',
        sep='\t')['tableId']
    colnames_mentions = pd.read_csv(
        'C:/Users/605453/Documents/Projects/Firesail/Save Our Jobs/Part 2/Archive/Headers/schema_csvs/GDELT_2.0_eventMentions_Column_Labels_Header_Row_Sep2016.tsv',
        sep='\t')['0']
    colnames_events = pd.read_csv(
        'C:/Users/605453/Documents/Projects/Firesail/Save Our Jobs/Part 2/Archive/Headers/schema_csvs/GDELT_2.0_Events_Column_Labels_Header_Row_Sep2016.csv'
    )['tableId']

    # Map the free-text GDELT columns to MEDIUMTEXT so long fields fit.
    dict_gkg = {colnames_gkg[0]: VARCHAR(255)}
    dict_gkg.update(
        {x: MEDIUMTEXT(collation='utf8mb4_bin') for x in colnames_gkg[5:]})
    dict_mentions = {
        x: MEDIUMTEXT(collation='utf8mb4_bin') for x in colnames_mentions[3:]
    }
    dict_events = {
        x: MEDIUMTEXT(collation='utf8mb4_bin') for x in colnames_events[5:]
    }
    error_log = "C:\\Users\\605453\\Documents\\GDELT\\Errors.txt"

    def __init__(self, beg_date, end_date, folder, db):
        self.beg_month, self.beg_day, self.beg_year = [
            int(x) for x in beg_date.split("-")
        ]
        self.end_month, self.end_day, self.end_year = [
            int(x) for x in end_date.split("-")
        ]
        self.folder = folder
        self.db = db

    def pull(self, url_type, date, dtype):
        url = url_type.format(date)
        file_location = '{}{}.{}.CSV.zip'.format(self.folder, date, dtype)
        urllib.request.urlretrieve(url, file_location)
        return file_location

    def pandafy(self, file_location, colnames, index, GKG_drop=False):
        try:
            df = pd.read_csv(file_location, sep='\t', header=None,
                             names=colnames, encoding='utf-8')
        except Exception:  # fall back to latin-1 when the file is not valid UTF-8
            df = pd.read_csv(file_location, sep='\t', header=None,
                             names=colnames, encoding='latin-1')
        remove(file_location)
        df = df[df.iloc[:, 0].astype(str).apply(lambda x: len(x) < 255)]
        df = df.set_index(index)
        return df

    def insert(self, df, table_name, db_name, index):
        engine = create_engine(
            "mysql://*****:*****@154@localhost/{}?charset=utf8mb4".format(db_name),
            echo=False)
        con = engine.connect()
        if table_name == "mentions":
            df.to_sql(name=table_name, con=engine, if_exists='append',
                      chunksize=50, method="multi", schema=db_name,
                      index_label=index, dtype=self.dict_mentions)
        elif table_name == "events":
            df.to_sql(name=table_name, con=engine, if_exists='append',
                      chunksize=50, method="multi", schema=db_name,
                      index_label=index, dtype=self.dict_events)
        elif table_name == "gkg":
            df.to_sql(name=table_name, con=engine, if_exists='append',
                      chunksize=50, method="multi", schema=db_name,
                      index_label=index, dtype=self.dict_gkg)
        con.close()

    def execute(self, url, date, table, colnames, index, GKG_drop=False):
        try:
            result = self.pull(url, date, table)
            print(result)
            df = self.pandafy(result, colnames, index, GKG_drop=False)
            self.insert(df, table, self.db, index)
        except Exception:  # log and continue; one bad interval should not stop the run
            error = "Fail:{}-{}".format(date, table)
            print(error)
            print(sys.exc_info()[0])
            Errors.append(error)

    def scrape(self, GKG=False, Mentions=False, Events=False):
        start = datetime.datetime(year=self.beg_year, month=self.beg_month,
                                  day=self.beg_day, hour=0, minute=0)
        end = datetime.datetime(year=self.end_year, month=self.end_month,
                                day=self.end_day, hour=0, minute=0)
        days_to_collect = end - start
        # GDELT publishes a file every 15 minutes: 96 intervals per day.
        date_list = [
            end - datetime.timedelta(minutes=15 * x)
            for x in range(1, days_to_collect.days * 96 + 1)
        ]
        date_list = list(
            map(lambda x: x.strftime("%Y%m%d%H%M") + '00', date_list))
        global Errors
        Errors = []
        for date in date_list:
            print(date)
            if GKG:
                self.execute(self.GKG_url, date, "gkg",
                             self.colnames_gkg, 'GKGRECORDID')
            if Mentions:
                self.execute(self.Mentions_url, date, "mentions",
                             self.colnames_mentions, 'GLOBALEVENTID')
            if Events:
                self.execute(self.Events_url, date, "events",
                             self.colnames_events, 'GLOBALEVENTID')
        with open(self.error_log, "w") as outfile:
            outfile.write("\n".join(Errors))
        now = datetime.datetime.now()
        error_log_final = ("C:\\Users\\605453\\Documents\\GDELT\\Error Logs\\Errors_"
                           + now.strftime("%d%m%Y_%H%M%S") + ".txt")
        with open(error_log_final, "w") as outfile:
            outfile.write("\n".join(Errors))


def missedDates(beg_date, end_date, GKG=False, Mentions=False, Events=False):
    beg_year, beg_month, beg_day, beg_hour, beg_min = (
        int(beg_date[:4]), int(beg_date[4:6]), int(beg_date[6:8]),
        int(beg_date[8:10]), int(beg_date[10:12]))
    end_year, end_month, end_day, end_hour, end_min = (
        int(end_date[:4]), int(end_date[4:6]), int(end_date[6:8]),
        int(end_date[8:10]), int(end_date[10:12]))
    start = datetime.datetime(year=beg_year, month=beg_month, day=beg_day,
                              hour=beg_hour, minute=beg_min)
    end = datetime.datetime(year=end_year, month=end_month, day=end_day,
                            hour=end_hour, minute=end_min)
    days_to_collect = end - start
    date_list = [end]
    date_list.extend([
        end - datetime.timedelta(minutes=15 * x)
        for x in range(1, int((days_to_collect.total_seconds() / 60) / 15))
    ])
    date_list.extend([start])
def MediumText():
    return Text().with_variant(MEDIUMTEXT(), 'mysql')
class MySQLEngineSpec(BaseEngineSpec, BasicParametersMixin):
    engine = "mysql"
    engine_name = "MySQL"
    max_column_name_length = 64
    default_driver = "mysqldb"
    sqlalchemy_uri_placeholder = (
        "mysql://*****:*****@host:port/dbname[?key=value&key=value...]")
    encryption_parameters = {"ssl": "1"}

    column_type_mappings = (
        (re.compile(r"^int.*", re.IGNORECASE), INTEGER(), GenericDataType.NUMERIC),
        (re.compile(r"^tinyint", re.IGNORECASE), TINYINT(), GenericDataType.NUMERIC),
        (re.compile(r"^mediumint", re.IGNORECASE), MEDIUMINT(), GenericDataType.NUMERIC),
        (re.compile(r"^decimal", re.IGNORECASE), DECIMAL(), GenericDataType.NUMERIC),
        (re.compile(r"^float", re.IGNORECASE), FLOAT(), GenericDataType.NUMERIC),
        (re.compile(r"^double", re.IGNORECASE), DOUBLE(), GenericDataType.NUMERIC),
        (re.compile(r"^bit", re.IGNORECASE), BIT(), GenericDataType.NUMERIC),
        (re.compile(r"^tinytext", re.IGNORECASE), TINYTEXT(), GenericDataType.STRING),
        (re.compile(r"^mediumtext", re.IGNORECASE), MEDIUMTEXT(), GenericDataType.STRING),
        (re.compile(r"^longtext", re.IGNORECASE), LONGTEXT(), GenericDataType.STRING),
    )

    _time_grain_expressions = {
        None: "{col}",
        "PT1S": "DATE_ADD(DATE({col}), "
                "INTERVAL (HOUR({col})*60*60 + MINUTE({col})*60"
                " + SECOND({col})) SECOND)",
        "PT1M": "DATE_ADD(DATE({col}), "
                "INTERVAL (HOUR({col})*60 + MINUTE({col})) MINUTE)",
        "PT1H": "DATE_ADD(DATE({col}), INTERVAL HOUR({col}) HOUR)",
        "P1D": "DATE({col})",
        "P1W": "DATE(DATE_SUB({col}, INTERVAL DAYOFWEEK({col}) - 1 DAY))",
        "P1M": "DATE(DATE_SUB({col}, INTERVAL DAYOFMONTH({col}) - 1 DAY))",
        "P3M": "MAKEDATE(YEAR({col}), 1) "
               "+ INTERVAL QUARTER({col}) QUARTER - INTERVAL 1 QUARTER",
        "P1Y": "DATE(DATE_SUB({col}, INTERVAL DAYOFYEAR({col}) - 1 DAY))",
        "1969-12-29T00:00:00Z/P1W": "DATE(DATE_SUB({col}, "
                                    "INTERVAL DAYOFWEEK(DATE_SUB({col}, "
                                    "INTERVAL 1 DAY)) - 1 DAY))",
    }

    type_code_map: Dict[int, str] = {}  # loaded from get_datatype only if needed

    custom_errors: Dict[Pattern[str], Tuple[str, SupersetErrorType, Dict[str, Any]]] = {
        CONNECTION_ACCESS_DENIED_REGEX: (
            __('Either the username "%(username)s" or the password is incorrect.'),
            SupersetErrorType.CONNECTION_ACCESS_DENIED_ERROR,
            {"invalid": ["username", "password"]},
        ),
        CONNECTION_INVALID_HOSTNAME_REGEX: (
            __('Unknown MySQL server host "%(hostname)s".'),
            SupersetErrorType.CONNECTION_INVALID_HOSTNAME_ERROR,
            {"invalid": ["host"]},
        ),
        CONNECTION_HOST_DOWN_REGEX: (
            __('The host "%(hostname)s" might be down and can\'t be reached.'),
            SupersetErrorType.CONNECTION_HOST_DOWN_ERROR,
            {"invalid": ["host", "port"]},
        ),
        CONNECTION_UNKNOWN_DATABASE_REGEX: (
            __('Unable to connect to database "%(database)s".'),
            SupersetErrorType.CONNECTION_UNKNOWN_DATABASE_ERROR,
            {"invalid": ["database"]},
        ),
        SYNTAX_ERROR_REGEX: (
            __('Please check your query for syntax errors near '
               '"%(server_error)s". Then, try running your query again.'),
            SupersetErrorType.SYNTAX_ERROR,
            {},
        ),
    }

    @classmethod
    def convert_dttm(cls, target_type: str, dttm: datetime,
                     db_extra: Optional[Dict[str, Any]] = None) -> Optional[str]:
        tt = target_type.upper()
        if tt == utils.TemporalType.DATE:
            return f"STR_TO_DATE('{dttm.date().isoformat()}', '%Y-%m-%d')"
        if tt == utils.TemporalType.DATETIME:
            datetime_formatted = dttm.isoformat(sep=" ", timespec="microseconds")
            return f"""STR_TO_DATE('{datetime_formatted}', '%Y-%m-%d %H:%i:%s.%f')"""
        return None

    @classmethod
    def adjust_database_uri(cls, uri: URL,
                            selected_schema: Optional[str] = None) -> URL:
        if selected_schema:
            uri = uri.set(database=parse.quote(selected_schema, safe=""))
        return uri

    @classmethod
    def get_datatype(cls, type_code: Any) -> Optional[str]:
        if not cls.type_code_map:
            # only import and store if needed at least once
            # pylint: disable=import-outside-toplevel
            import MySQLdb

            ft = MySQLdb.constants.FIELD_TYPE
            cls.type_code_map = {
                getattr(ft, k): k
                for k in dir(ft) if not k.startswith("_")
            }
        datatype = type_code
        if isinstance(type_code, int):
            datatype = cls.type_code_map.get(type_code)
        if datatype and isinstance(datatype, str):
            return datatype
        return None

    @classmethod
    def epoch_to_dttm(cls) -> str:
        return "from_unixtime({col})"

    @classmethod
    def _extract_error_message(cls, ex: Exception) -> str:
        """Extract error message for queries"""
        message = str(ex)
        try:
            if isinstance(ex.args, tuple) and len(ex.args) > 1:
                message = ex.args[1]
        except (AttributeError, KeyError):
            pass
        return message

    @classmethod
    def get_column_spec(
        cls,
        native_type: Optional[str],
        db_extra: Optional[Dict[str, Any]] = None,
        source: utils.ColumnTypeSource = utils.ColumnTypeSource.GET_TABLE,
        column_type_mappings: Tuple[ColumnTypeMapping, ...] = column_type_mappings,
    ) -> Optional[ColumnSpec]:
        column_spec = super().get_column_spec(native_type)
        if column_spec:
            return column_spec
        return super().get_column_spec(
            native_type, column_type_mappings=column_type_mappings)

    @classmethod
    def get_cancel_query_id(cls, cursor: Any, query: Query) -> Optional[str]:
        """
        Get MySQL connection ID that will be used to cancel all other running
        queries in the same connection.

        :param cursor: Cursor instance in which the query will be executed
        :param query: Query instance
        :return: MySQL Connection ID
        """
        cursor.execute("SELECT CONNECTION_ID()")
        row = cursor.fetchone()
        return row[0]

    @classmethod
    def cancel_query(cls, cursor: Any, query: Query, cancel_query_id: str) -> bool:
        """
        Cancel query in the underlying database.

        :param cursor: New cursor instance to the db of the query
        :param query: Query instance
        :param cancel_query_id: MySQL Connection ID
        :return: True if query cancelled successfully, False otherwise
        """
        try:
            cursor.execute(f"KILL CONNECTION {cancel_query_id}")
        except Exception:  # pylint: disable=broad-except
            return False
        return True
from storyboard.db.decorators import UTCDateTime

CONF = cfg.CONF


def table_args():
    engine_name = urlparse.urlparse(cfg.CONF.database_connection).scheme
    if engine_name == 'mysql':
        return {'mysql_engine': cfg.CONF.mysql_engine,
                'mysql_charset': "utf8"}
    return None


# CUSTOM TYPES

# A MySQL medium text type.
MYSQL_MEDIUM_TEXT = UnicodeText().with_variant(MEDIUMTEXT(), 'mysql')


class CommonLength:
    top_large_length = 255
    top_middle_length = 100
    top_short_length = 50
    lower_large_length = 5
    lower_middle_length = 3
    lower_short_length = 1
    name_length = 30


class IdMixin(object):
    id = Column(Integer, primary_key=True)
class MySQLEngineSpec(BaseEngineSpec):
    engine = "mysql"
    engine_name = "MySQL"
    max_column_name_length = 64

    column_type_mappings: Tuple[
        Tuple[
            Pattern[str],
            Union[TypeEngine, Callable[[Match[str]], TypeEngine]],
            GenericDataType,
        ],
        ...,
    ] = (
        (re.compile(r"^int.*", re.IGNORECASE), INTEGER(), GenericDataType.NUMERIC),
        (re.compile(r"^tinyint", re.IGNORECASE), TINYINT(), GenericDataType.NUMERIC),
        (re.compile(r"^mediumint", re.IGNORECASE), MEDIUMINT(), GenericDataType.NUMERIC),
        (re.compile(r"^decimal", re.IGNORECASE), DECIMAL(), GenericDataType.NUMERIC),
        (re.compile(r"^float", re.IGNORECASE), FLOAT(), GenericDataType.NUMERIC),
        (re.compile(r"^double", re.IGNORECASE), DOUBLE(), GenericDataType.NUMERIC),
        (re.compile(r"^bit", re.IGNORECASE), BIT(), GenericDataType.NUMERIC),
        (re.compile(r"^tinytext", re.IGNORECASE), TINYTEXT(), GenericDataType.STRING),
        (re.compile(r"^mediumtext", re.IGNORECASE), MEDIUMTEXT(), GenericDataType.STRING),
        (re.compile(r"^longtext", re.IGNORECASE), LONGTEXT(), GenericDataType.STRING),
    )

    _time_grain_expressions = {
        None: "{col}",
        "PT1S": "DATE_ADD(DATE({col}), "
                "INTERVAL (HOUR({col})*60*60 + MINUTE({col})*60"
                " + SECOND({col})) SECOND)",
        "PT1M": "DATE_ADD(DATE({col}), "
                "INTERVAL (HOUR({col})*60 + MINUTE({col})) MINUTE)",
        "PT1H": "DATE_ADD(DATE({col}), INTERVAL HOUR({col}) HOUR)",
        "P1D": "DATE({col})",
        "P1W": "DATE(DATE_SUB({col}, INTERVAL DAYOFWEEK({col}) - 1 DAY))",
        "P1M": "DATE(DATE_SUB({col}, INTERVAL DAYOFMONTH({col}) - 1 DAY))",
        "P0.25Y": "MAKEDATE(YEAR({col}), 1) "
                  "+ INTERVAL QUARTER({col}) QUARTER - INTERVAL 1 QUARTER",
        "P1Y": "DATE(DATE_SUB({col}, INTERVAL DAYOFYEAR({col}) - 1 DAY))",
        "1969-12-29T00:00:00Z/P1W": "DATE(DATE_SUB({col}, "
                                    "INTERVAL DAYOFWEEK(DATE_SUB({col}, "
                                    "INTERVAL 1 DAY)) - 1 DAY))",
    }

    type_code_map: Dict[int, str] = {}  # loaded from get_datatype only if needed

    custom_errors = {
        CONNECTION_ACCESS_DENIED_REGEX: (
            __('Either the username "%(username)s" or the password is incorrect.'),
            SupersetErrorType.CONNECTION_ACCESS_DENIED_ERROR,
        ),
        CONNECTION_INVALID_HOSTNAME_REGEX: (
            __('Unknown MySQL server host "%(hostname)s".'),
            SupersetErrorType.CONNECTION_INVALID_HOSTNAME_ERROR,
        ),
        CONNECTION_HOST_DOWN_REGEX: (
            __('The host "%(hostname)s" might be down and can\'t be reached.'),
            SupersetErrorType.CONNECTION_HOST_DOWN_ERROR,
        ),
        CONNECTION_UNKNOWN_DATABASE_REGEX: (
            __(
                'We were unable to connect to your database named "%(database)s". '
                "Please verify your database name and try again."
            ),
            SupersetErrorType.CONNECTION_UNKNOWN_DATABASE_ERROR,
        ),
    }

    @classmethod
    def convert_dttm(cls, target_type: str, dttm: datetime) -> Optional[str]:
        tt = target_type.upper()
        if tt == utils.TemporalType.DATE:
            return f"STR_TO_DATE('{dttm.date().isoformat()}', '%Y-%m-%d')"
        if tt == utils.TemporalType.DATETIME:
            datetime_formatted = dttm.isoformat(sep=" ", timespec="microseconds")
            return f"""STR_TO_DATE('{datetime_formatted}', '%Y-%m-%d %H:%i:%s.%f')"""
        return None

    @classmethod
    def adjust_database_uri(
        cls, uri: URL, selected_schema: Optional[str] = None
    ) -> None:
        if selected_schema:
            uri.database = parse.quote(selected_schema, safe="")

    @classmethod
    def get_datatype(cls, type_code: Any) -> Optional[str]:
        if not cls.type_code_map:
            # only import and store if needed at least once
            import MySQLdb

            ft = MySQLdb.constants.FIELD_TYPE
            cls.type_code_map = {
                getattr(ft, k): k
                for k in dir(ft) if not k.startswith("_")
            }
        datatype = type_code
        if isinstance(type_code, int):
            datatype = cls.type_code_map.get(type_code)
        if datatype and isinstance(datatype, str):
            return datatype
        return None

    @classmethod
    def epoch_to_dttm(cls) -> str:
        return "from_unixtime({col})"

    @classmethod
    def _extract_error_message(cls, ex: Exception) -> str:
        """Extract error message for queries"""
        message = str(ex)
        try:
            if isinstance(ex.args, tuple) and len(ex.args) > 1:
                message = ex.args[1]
        except (AttributeError, KeyError):
            pass
        return message

    @classmethod
    def get_column_spec(  # type: ignore
        cls,
        native_type: Optional[str],
        source: utils.ColumnTypeSource = utils.ColumnTypeSource.GET_TABLE,
        column_type_mappings: Tuple[
            Tuple[
                Pattern[str],
                Union[TypeEngine, Callable[[Match[str]], TypeEngine]],
                GenericDataType,
            ],
            ...,
        ] = column_type_mappings,
    ) -> Union[ColumnSpec, None]:
        column_spec = super().get_column_spec(native_type)
        if column_spec:
            return column_spec
        return super().get_column_spec(
            native_type, column_type_mappings=column_type_mappings
        )
class DagCode(Base):
    """A table for DAGs code.

    dag_code table contains code of DAG files synchronized by scheduler.

    For details on dag serialization see SerializedDagModel
    """

    __tablename__ = 'dag_code'

    fileloc_hash = Column(BigInteger, nullable=False, primary_key=True,
                          autoincrement=False)
    # The max length of fileloc exceeds the limit of indexing.
    fileloc = Column(String(2000), nullable=False)
    last_updated = Column(UtcDateTime, nullable=False)
    source_code = Column(Text().with_variant(MEDIUMTEXT(), 'mysql'), nullable=False)

    def __init__(self, full_filepath: str, source_code: Optional[str] = None):
        self.fileloc = full_filepath
        self.fileloc_hash = DagCode.dag_fileloc_hash(self.fileloc)
        self.last_updated = timezone.utcnow()
        self.source_code = source_code or DagCode.code(self.fileloc)

    @provide_session
    def sync_to_db(self, session=None):
        """Writes code into database.

        :param session: ORM Session
        """
        self.bulk_sync_to_db([self.fileloc], session)

    @classmethod
    @provide_session
    def bulk_sync_to_db(cls, filelocs: Iterable[str], session=None):
        """Writes code in bulk into database.

        :param filelocs: file paths of DAGs to sync
        :param session: ORM Session
        """
        filelocs = set(filelocs)
        filelocs_to_hashes = {
            fileloc: DagCode.dag_fileloc_hash(fileloc) for fileloc in filelocs
        }
        existing_orm_dag_codes = (
            session.query(DagCode)
            .filter(DagCode.fileloc_hash.in_(filelocs_to_hashes.values()))
            .with_for_update(of=DagCode)
            .all())

        if existing_orm_dag_codes:
            existing_orm_dag_codes_map = {
                orm_dag_code.fileloc: orm_dag_code
                for orm_dag_code in existing_orm_dag_codes
            }
        else:
            existing_orm_dag_codes_map = {}

        existing_orm_dag_codes_by_fileloc_hashes = {
            orm.fileloc_hash: orm for orm in existing_orm_dag_codes
        }
        existing_orm_filelocs = {
            orm.fileloc for orm in existing_orm_dag_codes_by_fileloc_hashes.values()
        }
        if not existing_orm_filelocs.issubset(filelocs):
            conflicting_filelocs = existing_orm_filelocs.difference(filelocs)
            hashes_to_filelocs = {
                DagCode.dag_fileloc_hash(fileloc): fileloc for fileloc in filelocs
            }
            message = ""
            for fileloc in conflicting_filelocs:
                filename = hashes_to_filelocs[DagCode.dag_fileloc_hash(fileloc)]
                message += (
                    f"Filename '{filename}' causes a hash collision in the "
                    f"database with '{fileloc}'. Please rename the file.")
            raise AirflowException(message)

        existing_filelocs = {dag_code.fileloc for dag_code in existing_orm_dag_codes}
        missing_filelocs = filelocs.difference(existing_filelocs)

        for fileloc in missing_filelocs:
            orm_dag_code = DagCode(fileloc, cls._get_code_from_file(fileloc))
            session.add(orm_dag_code)

        for fileloc in existing_filelocs:
            current_version = existing_orm_dag_codes_by_fileloc_hashes[
                filelocs_to_hashes[fileloc]]
            file_mod_time = datetime.fromtimestamp(
                os.path.getmtime(correct_maybe_zipped(fileloc)), tz=timezone.utc)

            if file_mod_time > current_version.last_updated:
                orm_dag_code = existing_orm_dag_codes_map[fileloc]
                orm_dag_code.last_updated = file_mod_time
                orm_dag_code.source_code = cls._get_code_from_file(orm_dag_code.fileloc)
                session.merge(orm_dag_code)

    @classmethod
    @provide_session
    def remove_deleted_code(cls, alive_dag_filelocs: List[str], session=None):
        """Deletes code not included in alive_dag_filelocs.

        :param alive_dag_filelocs: file paths of alive DAGs
        :param session: ORM Session
        """
        alive_fileloc_hashes = [
            cls.dag_fileloc_hash(fileloc) for fileloc in alive_dag_filelocs
        ]

        log.debug("Deleting code from %s table ", cls.__tablename__)

        session.query(cls).filter(
            cls.fileloc_hash.notin_(alive_fileloc_hashes),
            cls.fileloc.notin_(alive_dag_filelocs)).delete(synchronize_session='fetch')

    @classmethod
    @provide_session
    def has_dag(cls, fileloc: str, session=None) -> bool:
        """Checks a file exist in dag_code table.

        :param fileloc: the file to check
        :param session: ORM Session
        """
        fileloc_hash = cls.dag_fileloc_hash(fileloc)
        return session.query(literal(True)).filter(
            cls.fileloc_hash == fileloc_hash).one_or_none() is not None

    @classmethod
    def get_code_by_fileloc(cls, fileloc: str) -> str:
        """Returns source code for a given fileloc.

        :param fileloc: file path of a DAG
        :return: source code as string
        """
        return cls.code(fileloc)

    @classmethod
    def code(cls, fileloc) -> str:
        """Returns source code for this DagCode object.

        :return: source code as string
        """
        return cls._get_code_from_db(fileloc)

    @staticmethod
    def _get_code_from_file(fileloc):
        with open_maybe_zipped(fileloc, 'r') as f:
            code = f.read()
        return code

    @classmethod
    @provide_session
    def _get_code_from_db(cls, fileloc, session=None):
        dag_code = session.query(cls).filter(
            cls.fileloc_hash == cls.dag_fileloc_hash(fileloc)).first()
        if not dag_code:
            raise DagCodeNotFound()
        else:
            code = dag_code.source_code
        return code

    @staticmethod
    def dag_fileloc_hash(full_filepath: str) -> int:
        """Hashing file location for indexing.

        :param full_filepath: full filepath of DAG file
        :return: hashed full_filepath
        """
        # Hashing is needed because the length of fileloc is 2000 as an Airflow
        # convention, which is over the limit of indexing.
        import hashlib

        # Only 7 bytes because MySQL BigInteger can hold only 8 bytes (signed).
        return struct.unpack(
            '>Q', hashlib.sha1(full_filepath.encode('utf-8')).digest()[-8:])[0] >> 8
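# A small, hedged illustration (not from the source) of why dag_fileloc_hash
# fits in a signed BigInteger: sha1's last 8 bytes are read as an unsigned
# 64-bit integer, then shifted right by 8, leaving 56 bits (always < 2**63).
import hashlib
import struct


def fileloc_hash(path: str) -> int:
    # Same arithmetic as DagCode.dag_fileloc_hash above.
    return struct.unpack('>Q', hashlib.sha1(path.encode('utf-8')).digest()[-8:])[0] >> 8


h = fileloc_hash('/opt/airflow/dags/example.py')  # hypothetical DAG path
assert 0 <= h < 2 ** 56  # 7 significant bytes, safe for MySQL's signed BIGINT
print(h)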
def MediumText():
    return sqlalchemy.Text().with_variant(MEDIUMTEXT(), 'mysql')
def MediumText() -> Variant:
    return Text().with_variant(MEDIUMTEXT(), "mysql")
def __init__(self, databaseurl, debug):
    self.engine = create_engine(databaseurl, encoding="utf-8", echo=debug,
                                pool_recycle=True)
    logging.info("Opened DB connection.")
    self.meta = MetaData()
    self.tables = {}  # registry of Table objects, populated below

    # table with file mirrors
    self.tables['mirror'] = Table(
        'mirror', self.meta,
        Column('mid', INTEGER(display_width=10), primary_key=True,
               nullable=False, autoincrement=True),
        Column('title', VARCHAR(length=64)),
        Column('description', TEXT()),
        Column('country', VARCHAR(length=64)),
        Column('url_prefix', VARCHAR(length=64)),              # prefix to files
        Column('url_daemon', VARCHAR(length=64)),              # absolute url to daemon.php
        Column('mirror_size', INTEGER(display_width=11)),      # maximum size of mirror
        Column('bandwidth_limit', INTEGER(display_width=11)),  # upload speed limit in kb/s
        Column('status', INTEGER(display_width=4)))            # 0=inactive, 1=active

    # table with files on file mirrors
    self.tables['mirror_file'] = Table(
        'mirror_file', self.meta,
        Column('mfid', INTEGER(display_width=10), primary_key=True,
               nullable=False, autoincrement=True),
        Column('fid', Integer, ForeignKey("file.fid")),
        Column('mid', INTEGER(display_width=4), ForeignKey("mirror.mid")),  # mirror id
        Column('path', VARCHAR(length=1024)),  # path relative to (mirror.url_prefix)
        Column('lastcheck', DATETIME(timezone=False)),  # last time checksum/existence was checked
        # 0=inactive, 1=active, 2=marked for recheck, 3=broken, 4=archived (=possibly deleted)
        Column('status', INTEGER(display_width=4)),
        UniqueConstraint('fid', 'mid'))

    # all known files
    self.tables['file'] = Table(
        'file', self.meta,
        Column('fid', INTEGER(display_width=10), primary_key=True,
               nullable=False, autoincrement=True),  # primary key of file
        Column('uid', INTEGER(display_width=10), default=0, nullable=False),  # owner uid of file
        Column('filename', VARCHAR(length=255), nullable=False, unique=False),  # filename (without path)
        Column('path', VARCHAR(length=1024), default='', nullable=False),  # relative path of the file (without filename!)
        Column('size', INTEGER(display_width=11), nullable=False),  # file size
        # 0=inactive, 1=active, 2=marked for recheck, 3=broken
        Column('status', INTEGER(display_width=11), nullable=False),
        Column('timestamp', TIMESTAMP(timezone=False)),
        Column('md5', CHAR(length=32), unique=True),
        Column('sha1', CHAR(length=40)),
        Column('sha256', CHAR(length=64)),
        Column('name', VARCHAR(length=256)),     # spring name of this file
        Column('version', VARCHAR(length=256)),  # spring version of this file
        Column('cid', INTEGER(display_width=11)),  # category of this file: game/map
        Column('sdp', VARCHAR(length=32), nullable=True, unique=True),  # for this file
        Column('metadata', MEDIUMTEXT()),
        UniqueConstraint('name', 'version', 'cid'),
        UniqueConstraint('filename', 'cid'))

    # self.tables['rapidrepo'] = Table('rapidrepo', self.meta,
    #     Column('rid', INTEGER(display_width=10), primary_key=True, nullable=False, autoincrement=True),
    #     Column('baseurl', VARCHAR(length=32), unique=True, nullable=False))  # i.e. https://repos.springrts.com/ba/
    # self.tables['rapid'] = Table('rapid', self.meta,
    #     Column('rid', INTEGER(display_width=10), primary_key=True, nullable=False, autoincrement=True),
    #     Column('fid', Integer, ForeignKey("file.fid"), nullable=True, unique=True),
    #     Column('repo', Integer, ForeignKey("rapid-repo.rid"), nullable=True, unique=True),
    #     Column('sdp', VARCHAR(length=32), unique=True),
    #     Column('timestamp', TIMESTAMP(timezone=False)))

    self.tables['tag'] = Table(
        'tag', self.meta,
        Column('tid', INTEGER(display_width=10), primary_key=True,
               nullable=False, autoincrement=True),
        Column('fid', Integer, ForeignKey("file.fid"), nullable=False),
        Column('tag', VARCHAR(length=128), unique=True))

    # file categories
    self.tables['categories'] = Table(
        'categories', self.meta,
        Column('cid', INTEGER(display_width=11), primary_key=True,
               nullable=False, autoincrement=True),
        Column('name', VARCHAR(length=24), nullable=False))

    self.tables['file_depends'] = Table(
        'file_depends', self.meta,
        Column('fid', Integer, ForeignKey("file.fid")),
        # id of the other file; if null (couldn't be resolved), use depends_string
        Column('depends_fid', Integer, ForeignKey("file.fid"), nullable=True),
        Column('depends_string', VARCHAR(length=64), nullable=False),
        UniqueConstraint('fid', 'depends_string'))

    try:
        self.meta.create_all(self.engine)
    except Exception as e:
        raise Exception("Unable to initialize database %s:%s" % (databaseurl, e))
    self.meta.bind = self.engine