Exemplo n.º 1
0
def get_top(table='secwiki', column='domain', time=2020, top=10):
    """Compute the share held by the ``top`` most frequent values of a column.

    Intended as input for a pie chart: the leading values each get their
    fraction of all matching rows and everything else is folded into
    ``'other'``.

    :param table: table to query
    :param column: column whose values are grouped and counted
    :param time: substring the ``ts`` column must contain (e.g. a year)
    :param top: number of leading values to report individually
    :return: dict mapping each of the top values to its fraction (rounded to
        4 decimals) plus an ``'other'`` key holding the remainder
    """
    so = SQLite("data/secwiki.db")
    # NOTE(review): table, column and time are interpolated straight into the
    # SQL text, so callers must only pass trusted values.
    sql = "select {column},count(url) as ct from {table} \
		  where ts like '%{time}%' \
		  group by {column} \
		  order by ct DESC".format(column=column, table=table, time=time)

    # Rows arrive ordered by descending count; keep that order.
    counts = OrderedDict()
    for row in so.query(sql):
        counts[row[0]] = row[1]

    # The grand total does not change while iterating, so compute it once
    # instead of re-summing every count for each reported value.
    total = sum(counts.values())

    od_pec = dict()
    for rank, (value, count) in enumerate(counts.items()):
        if rank >= top:
            break
        od_pec[value] = round(count / total, 4)
    od_pec['other'] = round(1 - sum(od_pec.values()), 4)
    return od_pec
    def __init__(self):
        """Configure the daemon attributes and open the articles table.

        The table is created when ``self.db.get`` returns nothing. If a table
        exists but ``get_info()`` differs from ``ToiScraper.TABLE_SCHEMA`` an
        ``Exception`` is raised. Finally ``self.iter_date`` is seeded from
        ``_get_init_date_full``.
        """
        # Attributes the original author marked as "Required vars".
        self.stdin_path = self.stdout_path = self.stderr_path = '/dev/null'
        self.pidfile_path = PID_FILE_PATH
        self.pidfile_timeout = 5

        self.db_name = DB_PATH
        self.db = SQLite(self.db_name)
        table_name = ToiScraper.TABLE_NAME
        self.table = self.db.get(table_name)
        print("Initializing...")
        if not self.table:
            print("No table found with name {0}. Creating it.".format(table_name))
            self.table = self.db.create(table_name, ToiScraper.TABLE_SCHEMA)
        elif not self.table.get_info() == ToiScraper.TABLE_SCHEMA:
            # An existing table with another layout is never altered.
            error_str = "Table {0} exists but with incorrect schema".format(table_name)
            print(error_str)
            raise Exception(error_str)
        self.iter_date = self._get_init_date_full()
Exemplo n.º 3
0
	def setup(cls):
		"""Make sure the token database file exists, creating it if needed.

		Returns immediately when ``cls.did_setup`` is truthy or when
		``DB_FILE`` already exists. Otherwise the parent directory is created
		if missing, followed by the ``record_tokens`` table and its indexes.
		If the directory cannot be created the failure is printed and the
		method returns without touching the database.
		"""
		if cls.did_setup:
			return
		
		# init the database if needed
		if not os.path.exists(DB_FILE):
			
			# make sure the parent directory exists
			parent_dir = os.path.dirname(DB_FILE)
			if len(parent_dir) > 0 and not os.path.exists(parent_dir):
				try:
					os.makedirs(parent_dir)
				# "as" binding and the parenthesised print are valid on both
				# Python 2.6+ and Python 3; the old forms fail to parse on 3.
				except Exception as e:
					print("Failed to create %s: %s" % (parent_dir, e))
					return
			
			# database init
			sql = SQLite.get(DB_FILE)
			sql.create('record_tokens', '''(
					token_id INTEGER PRIMARY KEY,
					record_id INT,
					on_server VARCHAR,
					cookie VARCHAR,
					token VARCHAR,
					secret VARCHAR,
					added TIMESTAMP,
					CONSTRAINT record_server UNIQUE (record_id, on_server) ON CONFLICT REPLACE
				)''')
			sql.execute("CREATE INDEX IF NOT EXISTS record_index ON record_tokens (record_id)")
			sql.execute("CREATE INDEX IF NOT EXISTS server_index ON record_tokens (on_server)")
			sql.execute("CREATE INDEX IF NOT EXISTS cookie_index ON record_tokens (cookie)")
			sql.execute("CREATE INDEX IF NOT EXISTS token_index ON record_tokens (token)")
Exemplo n.º 4
0
	def setup_tables(cls):
		"""Create the ``descriptions`` and ``relationships`` tables and indexes.

		A handle on ``databases/snomed.db`` is opened and cached on the class
		when none is present. Indexes are created with ``IF NOT EXISTS``;
		whether ``create`` skips existing tables is up to the ``SQLite``
		helper.
		"""
		if cls.sqlite_handle is None:
			cls.sqlite_handle = SQLite.get(os.path.join('databases', 'snomed.db'))
		db = cls.sqlite_handle
		
		descriptions_schema = '''(
				concept_id INTEGER PRIMARY KEY,
				lang TEXT,
				term TEXT,
				isa VARCHAR,
				active INT
			)'''
		relationships_schema = '''(
				relationship_id INTEGER PRIMARY KEY,
				source_id INT,
				destination_id INT,
				rel_type INT,
				rel_text VARCHAR,
				active INT
			)'''
		relationship_indexes = (
			"CREATE INDEX IF NOT EXISTS source_index ON relationships (source_id)",
			"CREATE INDEX IF NOT EXISTS destination_index ON relationships (destination_id)",
			"CREATE INDEX IF NOT EXISTS rel_type_index ON relationships (rel_type)",
			"CREATE INDEX IF NOT EXISTS rel_text_index ON relationships (rel_text)",
		)
		
		# descriptions table first, then its single index
		db.create('descriptions', descriptions_schema)
		db.execute("CREATE INDEX IF NOT EXISTS isa_index ON descriptions (isa)")
		
		# relationships table followed by one index per lookup column
		db.create('relationships', relationships_schema)
		for statement in relationship_indexes:
			db.execute(statement)
Exemplo n.º 5
0
    def setup_tables(cls):
        """ Creates the SQLite tables we need, not the tables we deserve.
		"""
        if cls.sqlite_handle is None:
            cls.sqlite_handle = SQLite.get("databases/snomed.db")

            # descriptions
        cls.sqlite_handle.create(
            "descriptions",
            """(
				concept_id INTEGER PRIMARY KEY,
				lang TEXT,
				term TEXT,
				isa VARCHAR,
				active INT
			)""",
        )
        cls.sqlite_handle.execute("CREATE INDEX IF NOT EXISTS isa_index ON descriptions (isa)")

        # relationships
        cls.sqlite_handle.create(
            "relationships",
            """(
				relationship_id INTEGER PRIMARY KEY,
				source_id INT,
				destination_id INT,
				rel_type INT,
				rel_text VARCHAR,
				active INT
			)""",
        )
        cls.sqlite_handle.execute("CREATE INDEX IF NOT EXISTS source_index ON relationships (source_id)")
        cls.sqlite_handle.execute("CREATE INDEX IF NOT EXISTS destination_index ON relationships (destination_id)")
        cls.sqlite_handle.execute("CREATE INDEX IF NOT EXISTS rel_type_index ON relationships (rel_type)")
        cls.sqlite_handle.execute("CREATE INDEX IF NOT EXISTS rel_text_index ON relationships (rel_text)")
Exemplo n.º 6
0
    def setup_tables(cls):
        """ Creates the SQLite tables we need, not the tables we deserve.
		Does nothing if the tables/indexes already exist
		"""
        if cls.sqlite_handle is None:
            cls.sqlite_handle = SQLite.get(cls.database_path())

        # descriptions
        cls.sqlite_handle.create(
            'descriptions', '''(
				concept_id INTEGER PRIMARY KEY,
				lang TEXT,
				term TEXT,
				isa VARCHAR,
				active INT
			)''')

        # relationships
        cls.sqlite_handle.create(
            'relationships', '''(
				relationship_id INTEGER PRIMARY KEY,
				source_id INT,
				destination_id INT,
				rel_type INT,
				rel_text VARCHAR,
				active INT
			)''')
Exemplo n.º 7
0
	def setup_tables(cls):
		"""Create the two SNOMED tables used by this class.

		Lazily opens the handle on ``cls.database_path()`` and then asks the
		``SQLite`` helper to create ``descriptions`` and ``relationships``.
		"""
		if cls.sqlite_handle is None:
			cls.sqlite_handle = SQLite.get(cls.database_path())
		
		tables = (
			('descriptions', '''(
				concept_id INTEGER PRIMARY KEY,
				lang TEXT,
				term TEXT,
				isa VARCHAR,
				active INT
			)'''),
			('relationships', '''(
				relationship_id INTEGER PRIMARY KEY,
				source_id INT,
				destination_id INT,
				rel_type INT,
				rel_text VARCHAR,
				active INT
			)'''),
		)
		for table_name, columns in tables:
			cls.sqlite_handle.create(table_name, columns)
Exemplo n.º 8
0
	def setup_tables(cls):
		"""Create the ``snomed`` table and trigger the CSV import.

		Opens and caches a handle on ``umls.db`` when the class has none,
		creates the table through the ``SQLite`` helper and then calls
		``cls.import_snomed_from_csv()``.
		"""
		if cls.sqlite_handle is None:
			cls.sqlite_handle = SQLite.get('umls.db')
		
		snomed_columns = '''(
				concept_id INTEGER PRIMARY KEY,
				lang TEXT,
				term TEXT
			)'''
		cls.sqlite_handle.create('snomed', snomed_columns)
		
		cls.import_snomed_from_csv()
Exemplo n.º 9
0
    def __init__(self):
        super().__init__()
        from sqlite import SQLite

        absolute = os.path.dirname(os.path.realpath(__file__))
        db_file = os.environ.get("SQLITE_FILE")
        db_file = db_file if db_file else os.path.join(absolute, "databases/rxnorm.db")
        self.db_file = db_file
        self.handled = 0

        self.sqlite = SQLite.get(self.db_file)
        self.sqlite.execute("DROP TABLE IF EXISTS drug_cache")

        self.sqlite.execute(
            """CREATE TABLE drug_cache
						(rxcui varchar, property text, value text)"""
        )

        self.sqlite.execute("CREATE INDEX i_drug_cache ON drug_cache (rxcui, property)")

        self.sqlite.execute("DROP VIEW IF EXISTS drug_treatments_by_ndc")
        self.sqlite.execute(
            """CREATE VIEW drug_treatments_by_ndc as
				select a.value as ndc, b.value as treatment_intent
				from drug_cache a join drug_cache b on a.rxcui=b.rxcui
				where a.property='ndc' and b.property='treatment_intent'
				"""
        )

        self.sqlite.execute("DROP VIEW IF EXISTS drug_classes_by_ndc")
        self.sqlite.execute(
            """CREATE VIEW drug_classes_by_ndc as
				select a.value as ndc, b.value as drug_class
				from drug_cache a join drug_cache b on a.rxcui=b.rxcui
				where a.property='ndc' and b.property='drug_class'
				"""
        )

        self.sqlite.execute("DROP VIEW IF EXISTS drug_ingredients_by_ndc")
        self.sqlite.execute(
            """CREATE VIEW drug_ingredients_by_ndc as
				select a.value as ndc, b.value as drug_ingredient, c.str as ingredient_name
				from drug_cache a join drug_cache b on a.rxcui=b.rxcui
				join RXNCONSO c on c.rxcui=b.value
				where a.property='ndc' and b.property='ingredient'
				and c.sab='RXNORM' and c.tty='IN'
                """
        )
Exemplo n.º 10
0
	def setup_tables(cls):
		"""Create the ``studies`` table, then set up the eligibility tables.

		Opens and caches a handle on ``storage.db`` when the class has none
		and finishes by delegating to ``StudyEligibility.setup_tables()``.
		"""
		if cls.sqlite_handle is None:
			cls.sqlite_handle = SQLite.get('storage.db')
		
		studies_columns = '''(
			nct UNIQUE,
			updated TIMESTAMP,
			elig_gender INTEGER,
			elig_min_age INTEGER,
			elig_max_age INTEGER,
			elig_population TEXT,
			elig_sampling TEXT,
			elig_accept_healthy INTEGER DEFAULT 0,
			elig_criteria TEXT
		)'''
		cls.sqlite_handle.create('studies', studies_columns)
		
		StudyEligibility.setup_tables()
Exemplo n.º 11
0
def parse_all(fnames, reparse=False):
    """Parse downloaded secwiki HTML pages into text files and the database.

    Every parsed record is written tab-separated to
    ``data/txt/secwiki_<n>.txt`` and inserted into the ``secwiki`` table with
    the columns ts, tag, url, title, root_domain, domain and url_path.

    :param fnames: HTML file names to parse; ignored when ``reparse`` is true
    :param reparse: when true, delete all rows from ``secwiki`` and parse
        every file matching ``data/html/secwiki_*.html``
    :return: None
    """
    sqldb = SQLite('data/secwiki.db')

    # A full re-parse starts from every local HTML file and an empty table.
    if reparse:
        fnames = list(glob.iglob(r'data/html/secwiki_*.html'))
        sqldb.execute('delete from `secwiki`')

    # Covers both None and an empty list.
    if not fnames:
        print('No new secwiki')
        return

    sql = 'insert into `secwiki` (`ts`,`tag`,`url`,`title`,`root_domain`,`domain`,`url_path`) values(?,?,?,?,?,?,?);'

    # The output directory does not depend on the file, so create it once.
    txt_dir = path("data/txt")
    if not os.path.exists(txt_dir):
        os.mkdir(txt_dir)

    for fname in fnames:
        m = re.search(r'secwiki_(\d+)\.html', fname)
        if m is None:
            # Without the numeric suffix no output name can be derived.
            print('Skipping unrecognized file name: {}'.format(fname))
            continue

        # Skip sources whose non-empty output file already exists.
        rname = path('data/txt', 'secwiki_' + m.group(1) + '.txt')
        if os.path.exists(rname) and os.path.getsize(rname) > 0:
            continue

        # Keyed by ts + url so duplicates inside one page collapse.
        all_content = {}
        # Both files are closed even if parsing raises.
        with codecs.open(rname, mode='wb') as rf, codecs.open(fname, 'rb') as f:
            for content in parse_single(f):
                if content:
                    all_content[content[0] + content[2]] = content
                    line = "\t".join(content)
                    rf.write(line.encode() + b'\r\n')

        # Store the whole page in one batch.
        if all_content:
            sqldb.executemany(sql, all_content.values())
Exemplo n.º 12
0
def load():
    """Load the raw NVD records and label them by exploit tag.

    Rows whose reference tags do not contain "Exploit" get label 0, rows
    whose tags do get label 1. The combined frame is printed (head only),
    written to ``cve2.csv`` and returned.
    """
    so = SQLite('data/nvd.db')
    select_prefix = 'select CVE_Items_cve_CVE_data_meta_ID,CVE_Items_cve_description_description_data_value from nvd_cve where CVE_Items_cve_references_reference_data_tags '

    # Negative samples (label 0) are fetched before positive ones (label 1).
    frames = []
    for label, predicate in ((0, 'not like "%Exploit%"'), (1, 'like "%Exploit%"')):
        frame = sql2cve(so, select_prefix + predicate)
        frame['label'] = label
        frames.append(frame)

    cve = pd.concat(frames)
    print(cve.head())
    cve.to_csv('cve2.csv', index=False)
    return cve
Exemplo n.º 13
0
    def __init__(self):
        super().__init__()
        from sqlite import SQLite
        absolute = os.path.dirname(os.path.realpath(__file__))
        db_file = os.environ.get('SQLITE_FILE')
        db_file = db_file if db_file else os.path.join(absolute,
                                                       'databases/rxnorm.db')
        self.db_file = db_file
        self.handled = 0

        self.sqlite = SQLite.get(self.db_file)
        self.sqlite.execute('DROP TABLE IF EXISTS drug_cache')

        self.sqlite.execute('''CREATE TABLE drug_cache
						(rxcui varchar, property text, value text)''')

        self.sqlite.execute(
            'CREATE INDEX i_drug_cache ON drug_cache (rxcui, property)')

        self.sqlite.execute('DROP VIEW IF EXISTS drug_treatments_by_ndc')
        self.sqlite.execute('''CREATE VIEW drug_treatments_by_ndc as
				select a.value as ndc, b.value as treatment_intent
				from drug_cache a join drug_cache b on a.rxcui=b.rxcui
				where a.property='ndc' and b.property='treatment_intent'
				''')

        self.sqlite.execute('DROP VIEW IF EXISTS drug_classes_by_ndc')
        self.sqlite.execute('''CREATE VIEW drug_classes_by_ndc as
				select a.value as ndc, b.value as drug_class
				from drug_cache a join drug_cache b on a.rxcui=b.rxcui
				where a.property='ndc' and b.property='drug_class'
				''')

        self.sqlite.execute('DROP VIEW IF EXISTS drug_ingredients_by_ndc')
        self.sqlite.execute('''CREATE VIEW drug_ingredients_by_ndc as
				select a.value as ndc, b.value as drug_ingredient, c.str as ingredient_name
				from drug_cache a join drug_cache b on a.rxcui=b.rxcui
				join RXNCONSO c on c.rxcui=b.value
				where a.property='ndc' and b.property='ingredient'
				and c.sab='RXNORM' and c.tty='IN'
                ''')
Exemplo n.º 14
0
    def setup_tables(cls):
        """ Creates the SQLite tables we need, not the tables we deserve.
		"""
        if cls.sqlite_handle is None:
            cls.sqlite_handle = SQLite.get('databases/snomed.db')

        # descriptions
        cls.sqlite_handle.create(
            'descriptions', '''(
				concept_id INTEGER PRIMARY KEY,
				lang TEXT,
				term TEXT,
				isa VARCHAR,
				active INT
			)''')
        cls.sqlite_handle.execute(
            "CREATE INDEX IF NOT EXISTS isa_index ON descriptions (isa)")

        # relationships
        cls.sqlite_handle.create(
            'relationships', '''(
				relationship_id INTEGER PRIMARY KEY,
				source_id INT,
				destination_id INT,
				rel_type INT,
				rel_text VARCHAR,
				active INT
			)''')
        cls.sqlite_handle.execute(
            "CREATE INDEX IF NOT EXISTS source_index ON relationships (source_id)"
        )
        cls.sqlite_handle.execute(
            "CREATE INDEX IF NOT EXISTS destination_index ON relationships (destination_id)"
        )
        cls.sqlite_handle.execute(
            "CREATE INDEX IF NOT EXISTS rel_type_index ON relationships (rel_type)"
        )
        cls.sqlite_handle.execute(
            "CREATE INDEX IF NOT EXISTS rel_text_index ON relationships (rel_text)"
        )
  def __init__(self):
    """Configure the daemon attributes and open the articles table.

    Creates the table when ``self.db.get`` returns nothing, raises
    ``Exception`` when an existing table's ``get_info()`` differs from
    ``ToiScraper.TABLE_SCHEMA``, and finally seeds ``self.iter_date``.
    """
    # Attributes the original author marked as "Required vars".
    self.stdin_path = self.stdout_path = self.stderr_path = '/dev/null'
    self.pidfile_path = PID_FILE_PATH
    self.pidfile_timeout = 5

    self.db_name = DB_PATH
    self.db = SQLite(self.db_name)
    table_name = ToiScraper.TABLE_NAME
    self.table = self.db.get(table_name)
    logger.info("Initializing...")
    if not self.table:
      logger.info("No table found with name {0}. Creating it.".format(table_name))
      self.table = self.db.create(table_name, ToiScraper.TABLE_SCHEMA)
    elif not self.table.get_info() == ToiScraper.TABLE_SCHEMA:
      # An existing table with another layout is never altered.
      error_str = "Table {0} exists but with incorrect schema".format(table_name)
      logger.error(error_str)
      raise Exception(error_str)
    self.iter_date = self._get_init_date_full()
Exemplo n.º 16
0
class ToiScraper():
    """Scraper that walks the Times of India archive one day at a time.

    Article rows ``(ds, title, url)`` are stored in the ``articles`` table.
    ``run`` loops forever: it fetches the archive page for ``iter_date``,
    stores the de-duplicated articles and advances to the next day, sleeping
    once it has caught up with the current date in IST.
    """
    TABLE_NAME = 'articles'
    TABLE_SCHEMA = [(u'ds', u'text(10)'), (u'title', u'text'),
                    (u'url', u'text')]
    # Manually observed minimum date on TOI
    INIT_DATE = (2020, 1, 1)
    # A day counts as fully scraped once it holds more rows than this.
    MIN_ENTRIES = 600
    # Upper bound, in seconds, for a single sleep while waiting for a new day.
    MAX_SLEEP = 3600

    def __init__(self):
        """Open the database, ensure the table exists and pick the start date.

        Raises ``Exception`` if the table exists with a schema other than
        ``TABLE_SCHEMA``.
        """
        # ====  Required vars ===== #
        self.stdin_path = '/dev/null'
        self.stdout_path = '/dev/null'
        self.stderr_path = '/dev/null'
        self.pidfile_path = PID_FILE_PATH
        self.pidfile_timeout = 5
        # ========================= #

        self.db_name = DB_PATH
        self.db = SQLite(self.db_name)
        self.table = self.db.get(ToiScraper.TABLE_NAME)
        print("Initializing...")
        if not self.table:
            print("No table found with name {0}. Creating it.".format(
                ToiScraper.TABLE_NAME))
            self.table = self.db.create(ToiScraper.TABLE_NAME,
                                        ToiScraper.TABLE_SCHEMA)
        elif not self.table.get_info() == ToiScraper.TABLE_SCHEMA:
            error_str = "Table {0} exists but with incorrect schema".format(
                ToiScraper.TABLE_NAME)
            print(error_str)
            raise Exception(error_str)
        self.iter_date = self._get_init_date_full()

    @staticmethod
    def _format_date(datetuple):
        """Return ``(year, month, day)`` as a zero-padded ``YYYY-MM-DD`` string.

        This is the single format used for the ``ds`` column, both when rows
        are written and when they are looked up.
        """
        return datetime(*datetuple).strftime('%Y-%m-%d')

    def _get_init_date_full(self):
        """Return the day after the latest date holding more than MIN_ENTRIES rows.

        Falls back to ``INIT_DATE`` when no date qualifies.
        """
        print(
            "Retrieving last retrieved date from database with at least {0} in it"
            .format(ToiScraper.MIN_ENTRIES))
        first_date = self.db.execute("""
        SELECT
          a.ds,
          a.count
        FROM (
          SELECT
            ds,
            count(1) AS count
          FROM {0}
          GROUP BY ds
          ORDER BY DATE(ds) DESC
        ) a
        WHERE a.count > {1}
        LIMIT 1;
      """.format(ToiScraper.TABLE_NAME, ToiScraper.MIN_ENTRIES),
                                     get=True)
        if len(first_date) == 0:
            print(
                "No last date with given minimum entries found in DB, starting from beginning."
            )
            return ToiScraper.INIT_DATE
        print("Last date with entries {0} found. {1} entries total.".format(
            first_date[0][0], first_date[0][1]))
        return self.get_next_day(*tuple(map(int, first_date[0][0].split('-'))))

    def _get_init_date(self):
        """Return the day after the latest date with any rows, or INIT_DATE."""
        print("Retrieving last retrieved date from database")
        first_date = self.db.execute(
            'SELECT ds FROM {0} ORDER BY DATE(ds) DESC LIMIT 1'.format(
                ToiScraper.TABLE_NAME),
            get=True)
        if len(first_date) == 0:
            print("No last date found in DB, starting from beginning.")
            return ToiScraper.INIT_DATE
        print("Last date {0} found.".format(first_date[0]['ds']))
        return self.get_next_day(
            *tuple(map(int, first_date[0]['ds'].split('-'))))

    def get_last_valid_date(self):
        """Return the current time shifted from UTC by +5:30 (IST), naive."""
        return datetime.utcnow() + timedelta(hours=5, minutes=30)

    def is_valid_date(self, year, month, day):
        """Tell whether the date exists, is not before INIT_DATE and is over in IST."""
        try:
            cur_time = datetime(year, month, day)
        except ValueError:
            return False
        india_time = self.get_last_valid_date()
        return (cur_time + timedelta(days=1) < india_time
                and cur_time >= datetime(*ToiScraper.INIT_DATE))

    def compute_url_for_day(self, year, month, day):
        """Return the archive URL for the given day, or None if it is not valid."""
        if not self.is_valid_date(year, month, day):
            return None
        # Day count used in TOI URL (1st October, 2015 == 42278)
        day_count = (date(year, month, day) - date(1900, 1, 1)).days + 2
        return "http://timesofindia.indiatimes.com/{year}/{month}/{day}/archivelist/year-{year},month-{month},starttime-{daycount}.cms".format(
            year=year, month=month, day=day, daycount=day_count)

    def get_next_day(self, year, month, day):
        """Return the following calendar day as a ``(year, month, day)`` tuple."""
        next_day = datetime(year, month, day) + timedelta(days=1)
        return (next_day.year, next_day.month, next_day.day)

    def _retrieve_url_contents(self, url, datetuple):
        """Download an archive page and return ``[ds, title, url]`` rows.

        Links with empty text, links rejected by ``validate_url`` and repeated
        titles are dropped. Raises ``Exception`` unless exactly one div
        matches the expected style signature.
        """
        print("Request sent to url {0}".format(url))
        # NOTE(review): no timeout is passed, so a stalled connection can
        # block here indefinitely.
        req = requests.get(url)
        print("Response retrieved, parsing")
        soup = BeautifulSoup(req.text, 'lxml')
        # Signature of the element we're interested in. We rely on the TOI webpage
        # not to change
        divs = soup.find_all(
            'div',
            style=
            'font-family:arial ;font-size:12;font-weight:bold; color: #006699')
        if len(divs) != 1:
            error_str = "Found {0} divs matching signature. Aborting.".format(
                len(divs))
            # The class defines no ``error`` method; report the problem the
            # same way __init__ does so the intended exception is raised.
            print(error_str)
            raise Exception(error_str)
        articles = divs[0].find_all('a')
        print("Found {0} hyperlinks in the archive.".format(len(articles)))
        articles = [a for a in articles if len(a.text) > 0]
        ds = self._format_date(datetuple)
        res = []
        titles = set()
        for art in articles:
            corr_url = self.validate_url(art['href'])
            if not corr_url or art.text in titles:
                continue
            titles.add(art.text)
            res.append([ds, art.text, corr_url])
        print("Finished parsing, {0} rows remain".format(len(res)))
        return res

    def validate_url(self, url):
        """Validate a TOI article URL, completing relative ``.cms`` links.

        Returns the usable URL, or None when the link is rejected.
        """
        URL_CORRECT = 'http://timesofindia.indiatimes.com/'
        URL_STANDARD = 'http://'
        URL_INSIDE = '.indiatimes.com/'
        if url.startswith(URL_STANDARD) and URL_INSIDE in url:
            return url
        # Anything else is only accepted as a relative .cms path.
        if not url.endswith('.cms') or 'http' in url or ' ' in url:
            return None
        return URL_CORRECT + url

    def dedup_insert(self, data, ds):
        """Merge ``data`` with the rows already stored for day ``ds``.

        Existing rows for the day are deleted and the title-unique union of
        old and new rows is inserted.

        :param data: rows shaped ``[ds, title, url]``
        :param ds: ``(year, month, day)`` tuple of the day being stored
        """
        # Must match the zero-padded format used for stored rows; an unpadded
        # key such as "2020-1-5" would never find the existing rows.
        date_str = self._format_date(ds)
        print("Asking to insert {0} articles in {1}".format(
            len(data), date_str))
        rows = self.table.where({'ds': date_str})
        print("Already {0} rows exist in {1}".format(len(rows), date_str))
        titles = set()
        res = []
        for a in rows:
            if a['title'] not in titles:
                titles.add(a['title'])
                res.append((a['ds'], a['title'], a['url']))
        for r in data:
            if r[1] not in titles:
                titles.add(r[1])
                res.append(r)
        print("{0} rows left after deduplicating".format(len(res)))
        if len(rows) > 0:
            print("Deleting {0} rows from {1}".format(len(rows), date_str))
            self.table.del_where({'ds': date_str})
        if len(res) > 0:
            print("Inserting {0} rows from {1}".format(len(res), date_str))
            self.table.insert(res)

    def get_articles_for_day(self, year, month, day):
        """Fetch and store one day's articles; return the number parsed.

        Returns 0 when the day is not valid.
        """
        print("Getting articles for the day")
        url = self.compute_url_for_day(year, month, day)
        if not url:
            return 0
        data = self._retrieve_url_contents(url, (year, month, day))
        self.dedup_insert(data, (year, month, day))
        return len(data)

    def run(self):
        """Scrape day after day forever, sleeping once caught up with today."""
        while True:
            while not self.is_valid_date(*self.iter_date):
                next_date = datetime(*self.iter_date) + timedelta(days=1)
                # ``.seconds`` is the sub-day remainder of the difference.
                sec_to_next_date = (next_date -
                                    self.get_last_valid_date()).seconds
                print("Reached the end, {0} seconds until {1}".format(
                    sec_to_next_date,
                    datetime(*self.iter_date).strftime('%Y-%m-%d')))
                if sec_to_next_date <= ToiScraper.MAX_SLEEP:
                    time.sleep(sec_to_next_date)
                else:
                    print(
                        'Seconds till next day {0} greater than {1}, so only sleeping for {1}'
                        .format(sec_to_next_date, ToiScraper.MAX_SLEEP))
                    time.sleep(ToiScraper.MAX_SLEEP)
                print('Woken up, getting init date again')
                self.iter_date = self._get_init_date_full()
                print('New date set to {0}'.format(self.iter_date))
            print("Retrieving articles for date {0}".format(self.iter_date))
            num_rows = self.get_articles_for_day(*self.iter_date)
            print("Retrieved {0} rows from TOI".format(num_rows))
            if num_rows == 0:
                # Retry the same day after a short pause.
                print("Sleeping for 10 seconds, no rows retrieved")
                time.sleep(10)
            else:
                self.iter_date = self.get_next_day(*self.iter_date)
                print("Iterated to next day - {0}".format(
                    datetime(*self.iter_date)))
Exemplo n.º 17
0
	def __init__(self):
		"""Open the ``databases/umls.db`` file located next to this module."""
		module_dir = os.path.dirname(os.path.realpath(__file__))
		db_path = os.path.join(module_dir, 'databases/umls.db')
		self.sqlite = SQLite.get(db_path)
class ToiScraper():
  """Scraper that walks the Times of India archive one day at a time.

  Article rows ``(ds, title, url)`` are stored in the ``articles`` table.
  ``run`` loops forever: it fetches the archive page for ``iter_date``,
  stores the de-duplicated articles and advances to the next day, sleeping
  once it has caught up with the current date in IST.
  """
  TABLE_NAME = 'articles'
  TABLE_SCHEMA = [(u'ds', u'text(10)'), (u'title', u'text'), (u'url', u'text')]
  # Manually observed minimum date on TOI
  INIT_DATE = (2000, 1, 18)
  # A day counts as fully scraped once it holds more rows than this.
  MIN_ENTRIES = 600
  # Upper bound, in seconds, for a single sleep while waiting for a new day.
  MAX_SLEEP = 3600

  def __init__(self):
    """Open the database, ensure the table exists and pick the start date.

    Raises ``Exception`` if the table exists with a schema other than
    ``TABLE_SCHEMA``.
    """
    # ====  Required vars ===== #
    self.stdin_path = '/dev/null'
    self.stdout_path = '/dev/null'
    self.stderr_path = '/dev/null'
    self.pidfile_path = PID_FILE_PATH
    self.pidfile_timeout = 5
    # ========================= #

    self.db_name = DB_PATH
    self.db = SQLite(self.db_name)
    self.table = self.db.get(ToiScraper.TABLE_NAME)
    logger.info("Initializing...")
    if not self.table:
      logger.info("No table found with name {0}. Creating it.".format(ToiScraper.TABLE_NAME))
      self.table = self.db.create(ToiScraper.TABLE_NAME, ToiScraper.TABLE_SCHEMA)
    elif not self.table.get_info() == ToiScraper.TABLE_SCHEMA:
      error_str = "Table {0} exists but with incorrect schema".format(ToiScraper.TABLE_NAME)
      logger.error(error_str)
      raise Exception(error_str)
    self.iter_date = self._get_init_date_full()

  @staticmethod
  def _format_date(datetuple):
    """Return ``(year, month, day)`` as a zero-padded ``YYYY-MM-DD`` string.

    This is the single format used for the ``ds`` column, both when rows are
    written and when they are looked up.
    """
    return datetime(*datetuple).strftime('%Y-%m-%d')

  def _get_init_date_full(self):
    """Return the day after the latest date holding more than MIN_ENTRIES rows.

    Falls back to ``INIT_DATE`` when no date qualifies.
    """
    logger.info("Retrieving last retrieved date from database with at least {0} in it".format(ToiScraper.MIN_ENTRIES))
    first_date = self.db.execute("""
        SELECT
          a.ds,
          a.count
        FROM (
          SELECT
            ds,
            count(1) AS count
          FROM {0}
          GROUP BY ds
          ORDER BY DATE(ds) DESC
        ) a
        WHERE a.count > {1}
        LIMIT 1;
      """.format(ToiScraper.TABLE_NAME, ToiScraper.MIN_ENTRIES),
      get=True
    )
    if len(first_date) == 0:
      logger.info("No last date with given minimum entries found in DB, starting from beginning.")
      return ToiScraper.INIT_DATE
    logger.info("Last date with entries {0} found. {1} entries total.".format(first_date[0][0], first_date[0][1]))
    return self.get_next_day(*tuple(map(int, first_date[0][0].split('-'))))

  def _get_init_date(self):
    """Return the day after the latest date with any rows, or INIT_DATE."""
    logger.info("Retrieving last retrieved date from database")
    first_date = self.db.execute('SELECT ds FROM {0} ORDER BY DATE(ds) DESC LIMIT 1'.format(ToiScraper.TABLE_NAME), get=True)
    if len(first_date) == 0:
      logger.info("No last date found in DB, starting from beginning.")
      return ToiScraper.INIT_DATE
    logger.info("Last date {0} found.".format(first_date[0]['ds']))
    return self.get_next_day(*tuple(map(int, first_date[0]['ds'].split('-'))))

  def get_last_valid_date(self):
    """Return the current time shifted from UTC by +5:30 (IST), naive."""
    return datetime.utcnow() + timedelta(hours=5, minutes=30)

  def is_valid_date(self, year, month, day):
    """Tell whether the date exists, is not before INIT_DATE and is over in IST."""
    try:
      cur_time = datetime(year, month, day)
    except ValueError:
      return False
    india_time = self.get_last_valid_date()
    return (cur_time + timedelta(days=1) < india_time
            and cur_time >= datetime(*ToiScraper.INIT_DATE))

  def compute_url_for_day(self, year, month, day):
    """Return the archive URL for the given day, or None if it is not valid."""
    if not self.is_valid_date(year, month, day):
      return None
    # Day count used in TOI URL (1st October, 2015 == 42278)
    day_count = (date(year, month, day) - date(1900, 1, 1)).days + 2
    return "http://timesofindia.indiatimes.com/{year}/{month}/{day}/archivelist/year-{year},month-{month},starttime-{daycount}.cms".format(
        year=year,
        month=month,
        day=day,
        daycount=day_count
      )

  def get_next_day(self, year, month, day):
    """Return the following calendar day as a ``(year, month, day)`` tuple."""
    next_day = datetime(year, month, day) + timedelta(days=1)
    return (next_day.year, next_day.month, next_day.day)

  def _retrieve_url_contents(self, url, datetuple):
    """Download an archive page and return ``[ds, title, url]`` rows.

    Links with empty text, links rejected by ``validate_url`` and repeated
    titles are dropped. Raises ``Exception`` unless exactly one div matches
    the expected style signature.
    """
    logger.debug("Request sent to url {0}".format(url))
    # NOTE(review): no timeout is passed, so a stalled connection can block
    # here indefinitely.
    req = requests.get(url)
    logger.debug("Response retrieved, parsing")
    soup = BeautifulSoup(req.text, 'lxml')
    # Signature of the element we're interested in. We rely on the TOI webpage
    # not to change
    divs = soup.find_all('div', style='font-family:arial ;font-size:12;font-weight:bold; color: #006699')
    if len(divs) != 1:
      error_str = "Found {0} divs matching signature. Aborting.".format(len(divs))
      # The class defines no ``error`` method; log through the module logger
      # like __init__ does so the intended exception is raised.
      logger.error(error_str)
      raise Exception(error_str)
    articles = divs[0].find_all('a')
    logger.debug("Found {0} hyperlinks in the archive.".format(len(articles)))
    articles = [a for a in articles if len(a.text) > 0]
    ds = self._format_date(datetuple)
    res = []
    titles = set()
    for art in articles:
      corr_url = self.validate_url(art['href'])
      if not corr_url or art.text in titles:
        continue
      titles.add(art.text)
      res.append([ds, art.text, corr_url])
    logger.debug("Finished parsing, {0} rows remain".format(len(res)))
    return res

  def validate_url(self, url):
    """Validate a TOI article URL, completing relative ``.cms`` links.

    Returns the usable URL, or None when the link is rejected.
    """
    URL_CORRECT = 'http://timesofindia.indiatimes.com/'
    URL_STANDARD = 'http://'
    URL_INSIDE = '.indiatimes.com/'
    if url.startswith(URL_STANDARD) and URL_INSIDE in url:
      return url
    # Anything else is only accepted as a relative .cms path.
    if not url.endswith('.cms') or 'http' in url or ' ' in url:
      return None
    return URL_CORRECT + url

  def dedup_insert(self, data, ds):
    """Merge ``data`` with the rows already stored for day ``ds``.

    Existing rows for the day are deleted and the title-unique union of old
    and new rows is inserted.

    :param data: rows shaped ``[ds, title, url]``
    :param ds: ``(year, month, day)`` tuple of the day being stored
    """
    # Must match the zero-padded format used for stored rows; an unpadded key
    # such as "2020-1-5" would never find the existing rows.
    date_str = self._format_date(ds)
    logger.debug("Asking to insert {0} articles in {1}".format(len(data), date_str))
    rows = self.table.where({'ds': date_str})
    logger.debug("Already {0} rows exist in {1}".format(len(rows), date_str))
    titles = set()
    res = []
    for a in rows:
      if a['title'] not in titles:
        titles.add(a['title'])
        res.append((a['ds'], a['title'], a['url']))
    for r in data:
      if r[1] not in titles:
        titles.add(r[1])
        res.append(r)
    logger.debug("{0} rows left after deduplicating".format(len(res)))
    if len(rows) > 0:
      logger.info("Deleting {0} rows from {1}".format(len(rows), date_str))
      self.table.del_where({'ds': date_str})
    if len(res) > 0:
      logger.info("Inserting {0} rows from {1}".format(len(res), date_str))
      self.table.insert(res)

  def get_articles_for_day(self, year, month, day):
    """Fetch and store one day's articles; return the number parsed.

    Returns 0 when the day is not valid.
    """
    logger.debug("Getting articles for the day")
    url = self.compute_url_for_day(year, month, day)
    if not url:
      return 0
    data = self._retrieve_url_contents(url, (year, month, day))
    self.dedup_insert(data, (year, month, day))
    return len(data)

  def run(self):
    """Scrape day after day forever, sleeping once caught up with today."""
    while True:
      while not self.is_valid_date(*self.iter_date):
        next_date = datetime(*self.iter_date) + timedelta(days=1)
        # ``.seconds`` is the sub-day remainder of the difference.
        sec_to_next_date = (next_date - self.get_last_valid_date()).seconds
        logger.info("Reached the end, {0} seconds until {1}".format(sec_to_next_date, datetime(*self.iter_date).strftime('%Y-%m-%d')))
        if sec_to_next_date <= ToiScraper.MAX_SLEEP:
          time.sleep(sec_to_next_date)
        else:
          logger.info('Seconds till next day {0} greater than {1}, so only sleeping for {1}'.format(sec_to_next_date, ToiScraper.MAX_SLEEP))
          time.sleep(ToiScraper.MAX_SLEEP)
        logger.info('Woken up, getting init date again')
        self.iter_date = self._get_init_date_full()
        logger.info('New date set to {0}'.format(self.iter_date))
      logger.info("Retrieving articles for date {0}".format(self.iter_date))
      num_rows = self.get_articles_for_day(*self.iter_date)
      logger.info("Retrieved {0} rows from TOI".format(num_rows))
      if num_rows == 0:
        # Retry the same day after a short pause.
        logger.debug("Sleeping for 10 seconds, no rows retrieved")
        time.sleep(10)
      else:
        self.iter_date = self.get_next_day(*self.iter_date)
        logger.debug("Iterated to next day - {0}".format(datetime(*self.iter_date)))
Exemplo n.º 19
0
 def __init__(self):
     """Open the UMLS database at the relative path ``databases/umls.db``."""
     db_path = "databases/umls.db"
     self.sqlite = SQLite.get(db_path)
Exemplo n.º 20
0
 def __init__(self):
     """Open the RxNorm database at the relative path ``databases/rxnorm.db``."""
     db_path = "databases/rxnorm.db"
     self.sqlite = SQLite.get(db_path)
Exemplo n.º 21
0
 def __init__(self):
     """Open the SNOMED database at the relative path ``databases/snomed.db``."""
     db_path = 'databases/snomed.db'
     self.sqlite = SQLite.get(db_path)
Exemplo n.º 22
0
__license__ = 'MIT'
__copyright__ = 'Copyright (c) 2015 Muntashir Al-Islam'

"""
    Sample Database Test
    Tested on v0.2.0
    Note: Error handling will not work on v0.1.0

    Date: 22 Oct, 2015
"""

# Record the start time and open an in-memory database.
began = datetime.now()
print("Program starts at:", began)
print("Connecting to Database...", end=' ')

sqlite = SQLite(":memory:")

print("...")

# A truthy connect_error aborts the script with the reported error type.
if sqlite.connect_error:
    raise sqlite.connect_errno(sqlite.connect_error)
print("Connected to Database.")

# Create the sample table through a prepared statement.
print("Creating a table...", end=' ')
stmt = sqlite.prepare("""CREATE TABLE sample (
        ID integer PRIMARY KEY AUTOINCREMENT NOT NULL,
        Name text
        )""")
stmt.execute()
print("...")
if stmt.error:
    raise stmt.errno(stmt.error)
print("Table created.")
Exemplo n.º 23
0
        ts = float(ts)
    ts_str = datetime.datetime.fromtimestamp(ts).strftime(
        '%Y-%m-%d %H:%M:%S.%f')
    return ts_str


def get_md5(path):
    """Return the hexadecimal MD5 digest of the file at ``path``.

    The file is read in fixed-size chunks inside a ``with`` block, so the
    handle is always closed and large files are not loaded into memory at
    once.
    """
    digest = hashlib.md5()
    with open(path, 'rb') as fh:
        # iter() with a sentinel stops at the empty read that marks EOF.
        for chunk in iter(lambda: fh.read(65536), b''):
            digest.update(chunk)
    return digest.hexdigest()


def get_sha1(path):
    """Return the hexadecimal SHA-1 digest of the file at ``path``.

    The file is read in fixed-size chunks inside a ``with`` block, so the
    handle is always closed and large files are not loaded into memory at
    once.
    """
    digest = hashlib.sha1()
    with open(path, 'rb') as fh:
        # iter() with a sentinel stops at the empty read that marks EOF.
        for chunk in iter(lambda: fh.read(65536), b''):
            digest.update(chunk)
    return digest.hexdigest()


if __name__ == '__main__':
    sqlite = SQLite('data.db')
    # sqlite.insert()
    while True:
        input_dir = input('Enter folder path: ')
        if isinstance(input_dir, str):
            work_dir = input_dir
            break
        else:
            pass
    # work_dir = 'D:\\共享区'
    for parent, dirnames, filenames in os.walk(work_dir, followlinks=True):
        for filename in filenames:
            file_path = os.path.join(parent, filename)
            file_attr = stat(file_path)
            attr_list = [
                file_attr.st_mode, file_attr.st_uid, file_attr.st_gid,
Exemplo n.º 24
0
	def __init__(self):
		"""Open the database located at ``SNOMED.database_path()``."""
		db_path = SNOMED.database_path()
		self.sqlite = SQLite.get(db_path)
Exemplo n.º 25
0
	def __init__(self):
		"""Open the database file named by the module constant ``DB_FILE``."""
		handle = SQLite.get(DB_FILE)
		self.sqlite = handle
Exemplo n.º 26
0
 def __init__(self):
     """Open the UMLS database at ``../../databases/umls.db``.

     The path is relative, so it is resolved against the process working
     directory. The previous unused (and misspelled) ``absoulte`` local was
     dead code and has been removed.
     """
     self.sqlite = SQLite.get("../../databases/umls.db")
Exemplo n.º 27
0
 def __init__(self):
     """Open the RxNorm database at the relative path ``databases/rxnorm.db``."""
     db_path = 'databases/rxnorm.db'
     self.sqlite = SQLite.get(db_path)
Exemplo n.º 28
0
 def __init__(self):
     """Open the ``databases/rxnorm.db`` file located next to this module."""
     module_dir = os.path.dirname(os.path.realpath(__file__))
     db_path = os.path.join(module_dir, "databases/rxnorm.db")
     self.sqlite = SQLite.get(db_path)
Exemplo n.º 29
0
 def __init__(self):
     """Open the UMLS database at the relative path ``databases/umls.db``."""
     db_path = 'databases/umls.db'
     self.sqlite = SQLite.get(db_path)
Exemplo n.º 30
0
 def __init__(self):
     """Open the SNOMED database at the relative path ``databases/snomed.db``."""
     db_path = "databases/snomed.db"
     self.sqlite = SQLite.get(db_path)
Exemplo n.º 31
0
 def __init__(self):
     """Open the database located at ``SNOMED.database_path()``."""
     db_path = SNOMED.database_path()
     self.sqlite = SQLite.get(db_path)
Exemplo n.º 32
0
	def sqlite_assure_handle(cls):
		"""Open the default database once and cache the handle on the class."""
		if cls.sqlite_handle is not None:
			# A handle is already cached; nothing to do.
			return
		cls.sqlite_handle = SQLite.get(cls.sqlite_default_db)