async def parse_feed(username, url, data_directory):
    """Download not-yet-stored images from an RSS feed and record them.

    Entries are processed in feed order. Processing stops at the first entry
    whose id is already stored, or at the first image whose SHA-256 content
    hash is already stored.

    Args:
        username: Stored on every created record and used in log messages.
        url: Feed URL handed to ``feedparser.parse``.
        data_directory: Directory the downloaded images are moved into.

    Any exception is logged and swallowed; the session is always closed.
    """
    session = config.Session()
    try:
        feed = feedparser.parse(url)
        for entry in feed['entries']:
            published = datetime.strptime(
                entry['published'], '%a, %d %b %Y %H:%M:%S %z')

            # The id check is done once per entry rather than once per image:
            # with an autoflushing session a per-image check would find the
            # entry's own first image and wrongly stop at the second one.
            already_stored = session.query(InstgaramImageRss).filter(
                InstgaramImageRss.rss_webstagram_id == entry['id']).first()
            if already_stored is not None:
                break

            stop = False
            for i, media in enumerate(entry['media_thumbnail']):
                logging.info('processing {} for {}'.format(i, username))

                # NOTE(review): _get_candidate_names() is a private tempfile
                # helper; kept so the download target is still a fresh,
                # non-existing path created by Utils.download itself.
                tmp_path = os.path.join(
                    tempfile.gettempdir(),
                    next(tempfile._get_candidate_names()))
                try:
                    await Utils.download(media['url'], path=tmp_path)
                    with open(tmp_path, 'rb') as image_file:
                        image_hash = hashlib.sha256(
                            image_file.read()).hexdigest()

                    same_hash = session.query(InstgaramImageRss).filter(
                        InstgaramImageRss.image_hash == image_hash).all()
                    if same_hash:
                        stop = True
                        break

                    filename = urlparse(
                        entry["link"]).path[3:] + "_" + str(i) + '.jpg'
                    new_path = os.path.join(data_directory, filename)
                    shutil.move(tmp_path, new_path)
                finally:
                    # After a successful move the temp path is gone; on a
                    # duplicate or an error it must not be left behind.
                    if os.path.exists(tmp_path):
                        os.remove(tmp_path)

                image_rss = InstgaramImageRss()
                image_rss.published = published
                image_rss.local_name = filename
                image_rss.local_path = new_path
                image_rss.rss_webstagram_id = entry['id']
                image_rss.summary = entry['summary_detail']['value']
                image_rss.media_url = media['url']
                image_rss.image_hash = image_hash
                image_rss.creation_time = datetime.now()
                image_rss.link = entry['link']
                image_rss.sended = False
                image_rss.username = username
                session.add(image_rss)

            # Persist the entry's images together so a later failure does not
            # discard entries that were already fully processed.
            try:
                session.commit()
            except Exception as e:
                logging.exception(e)
                session.rollback()

            if stop:
                break
    except Exception as e:
        logging.exception(str(e))
    finally:
        session.close()
def main():
    """Print every stored chat and every Instagram subscription to stdout."""
    session = config.Session()
    try:
        for chat in session.query(InstagramBotDAO.Chat).all():
            print(chat)
        print('''---------- subscriptions ------ ''')
        for subscription in session.query(
                InstagramBotDAO.InstagramSubscription).all():
            print(subscription)
    finally:
        # Close the session even when a query or print fails.
        session.close()
async def run(self):
    """Call ``self.register()`` forever, sleeping between rounds.

    Exceptions raised by ``register`` are logged and do not stop the loop.
    The pause between rounds is ``config.TIME_SLEEP_REGISTER`` seconds.
    """
    logging.info('starting instaloader registering')
    while True:
        # No session is opened here: this loop never used one itself.
        try:
            self.register()
        except Exception as e:
            logging.exception(e)
        logging.info('sleeping instaloader registering...')
        await asyncio.sleep(config.TIME_SLEEP_REGISTER)
        logging.info('waking up registering')
async def _scrape(self, username):
    """Scrape one user and stamp the subscription's last check time.

    ``parse_user(username)`` is awaited first; afterwards the matching
    ``InstagramSubscription`` row gets ``last_check_datetime`` set to now.

    Args:
        username: Value matched against ``InstagramSubscription.username``
            and passed to ``parse_user``.

    Any exception is logged and swallowed; the session is always closed.
    """
    session = config.Session()
    try:
        subscription = session.query(InstagramSubscription).filter(
            InstagramSubscription.username == username).first()
        await parse_user(username)

        if subscription is None:
            # The scrape still ran; there is just no row to stamp.
            logging.warning(
                'no subscription found for %s, last check time not updated',
                username)
            return

        subscription.last_check_datetime = datetime.now()
        session.add(subscription)
        try:
            session.commit()
        except Exception as e:
            logging.exception(e)
            session.rollback()
    except Exception as e:
        logging.exception(e)
    finally:
        session.close()
def scrape(scrapper):
    """Store new forum themes and refresh the ones that were updated.

    For every section in ``config.sections`` the themes returned by
    ``scrapper.get_list_themes`` are compared with the stored rows:

    * no stored row: a screenshot is taken and a new row is added;
    * exactly one stored row: if the theme's ``update_time`` is newer than
      the stored ``last_update``, a new screenshot is taken and the row is
      flagged as updated and unsent;
    * more than one stored row: a warning is logged and the theme skipped.

    Each handled theme is committed on its own. The scraper and the session
    are closed even when an exception propagates.

    Args:
        scrapper: Object providing ``get_list_themes(section_id=...)``,
            ``get_screenshot(url)`` and ``close()``.
    """
    session = config.Session()
    logging.info('starting scrapper')
    try:
        for section_id, section_name in config.sections.items():
            for theme in scrapper.get_list_themes(section_id=section_id):
                # Both criteria are passed to filter() so they are ANDed in
                # SQL; a Python `and` between them drops one of the two.
                stored_themes = session.query(WatchRuDAO.WatchRuTheme).filter(
                    WatchRuDAO.WatchRuTheme.theme_id == theme['id'],
                    WatchRuDAO.WatchRuTheme.section_id == section_id).all()

                if not stored_themes:
                    logging.info('new theme, not stored in db -> adding')
                    ret_screens = scrapper.get_screenshot(theme['url'])
                    parsed_theme = WatchRuDAO.WatchRuTheme(
                        theme_id=theme['id'],
                        theme_name=theme['name'],
                        section_id=section_id,
                        section=section_name,
                        pdf_path=ret_screens['pdf'],
                        screenshot_path=ret_screens['jpeg'],
                        last_update=theme['update_time'],
                        sended=False,
                        was_updated=False,
                    )
                    session.add(parsed_theme)
                elif len(stored_themes) == 1:
                    stored_theme = stored_themes[0]
                    logging.info('old theme, i need check time update')
                    if theme['update_time'] <= stored_theme.last_update:
                        continue
                    ret_screens = scrapper.get_screenshot(theme['url'])
                    stored_theme.last_update = theme['update_time']
                    stored_theme.was_updated = True
                    stored_theme.sended = False
                    stored_theme.pdf_path = ret_screens['pdf']
                    stored_theme.screenshot_path = ret_screens['jpeg']
                else:
                    logging.warning(
                        'wtf too many themes with same id and section')
                    continue

                try:
                    session.commit()
                except Exception as e:
                    logging.exception(e)
                    session.rollback()
    finally:
        try:
            scrapper.close()
        finally:
            session.close()
    logging.info('finished')
async def run(self):
    """Scrape every active subscription in an endless loop.

    Each round loads the ``InstagramSubscription`` rows with
    ``subscribed == True`` and awaits ``self._scrape`` for each username,
    then sleeps ``config.TIME_SLEEP`` seconds. Exceptions are logged and do
    not stop the loop.
    """
    logging.info('starting subscription')
    while True:
        session = config.Session()
        try:
            logging.info('starting data')
            # `== True` is what builds the SQL comparison; do not replace
            # it with `is True`.
            subscriptions = session.query(InstagramSubscription).filter(
                InstagramSubscription.subscribed == True).all()  # noqa: E712
            # .all() returns a list (possibly empty), never None, so an
            # empty result simply falls through to the sleep below.
            for subscription in subscriptions:
                await self._scrape(subscription.username)
        except Exception as e:
            logging.exception(e)
        finally:
            session.close()
        logging.info('sleeping')
        await asyncio.sleep(config.TIME_SLEEP)
async def run(self):
    """Run ``self.scrape`` for every active subscription in an endless loop.

    Each round loads the ``InstagramSubscription`` rows with
    ``subscribed == True`` and awaits ``self.scrape`` for each username,
    then sleeps ``config.TIME_SLEEP_INSTALOADER_PARSER`` seconds. Exceptions
    are logged and do not stop the loop.
    """
    logging.info('starting instaloader')
    while True:
        session = config.Session()
        try:
            logging.info('starting data')
            # `== True` is what builds the SQL comparison; do not replace
            # it with `is True`.
            subscriptions = session.query(InstagramSubscription).filter(
                InstagramSubscription.subscribed == True).all()  # noqa: E712
            # .all() returns a list (possibly empty), never None, so an
            # empty result simply falls through to the sleep below.
            for subscription in subscriptions:
                logging.info('username===={}'.format(subscription))
                await self.scrape(subscription.username)
        except Exception as e:
            logging.exception(e)
        finally:
            session.close()
        logging.info('sleeping...')
        await asyncio.sleep(config.TIME_SLEEP_INSTALOADER_PARSER)
        logging.info('waking up')
def register(self):
    """Record downloaded media files from the registering folder in the DB.

    ``self.folder_for_registering`` is expected to contain one directory per
    username. Inside it, every ``jpg``/``mp4`` file whose name matches
    ``<date>_UTC[_<index>|_<type>].<ext>`` becomes one ``InstagramImageNoRss``
    row, unless a row with the same username, path, date and index already
    exists. Sidecar files sharing the same date prefix (``_UTC.txt``,
    ``_UTC.json``, ``_UTC_comments.json``, ``_UTC_location.txt``) are loaded
    into the row when present.

    Each new row is committed on its own. Any other exception is logged and
    ends the run; the session is always closed.
    """
    filename_pattern = re.compile(
        r'(?P<date>.*)_UTC(_((?P<index>\d+)|(?P<type>[\w|_]+)))?\.(?P<extension>jpg|json|txt|mp4)')

    def read_text(path):
        # Lines keep their own line ending and are joined with '\n' again,
        # exactly as stored so far.
        # NOTE(review): this doubles every line break - confirm intended.
        with open(path, 'r', encoding='utf8') as text_file:
            return '\n'.join(text_file.readlines())

    def read_json(path):
        # Parse and re-serialise so the stored value is normalised JSON.
        with open(path, 'r', encoding='utf8') as json_file:
            return json.dumps(json.load(json_file))

    session = config.Session()
    try:
        for username in os.listdir(self.folder_for_registering):
            current_directory = os.path.join(
                self.folder_for_registering, username)
            if not os.path.isdir(current_directory):
                # A stray file next to the user folders must not abort the
                # whole registration run.
                continue

            for file in os.listdir(current_directory):
                current_filepath = os.path.join(current_directory, file)
                parsed_filename = filename_pattern.search(file)
                if not parsed_filename:
                    logging.warning(
                        'ignored file = {}'.format(current_filepath))
                    continue
                if parsed_filename.group('extension') not in ['jpg', 'mp4']:
                    continue

                current_filedate = parsed_filename.group('date')
                published = datetime.datetime.strptime(
                    current_filedate, "%Y-%m-%d_%H-%M-%S")
                # NOTE(review): a matched index is a str while the default
                # is the int 1 - confirm the column type tolerates both.
                index = parsed_filename.group('index') or 1

                existing = session.query(InstagramImageNoRss).\
                    filter(InstagramImageNoRss.publication_index == index).\
                    filter(InstagramImageNoRss.username == username).\
                    filter(InstagramImageNoRss.local_path == current_filepath).\
                    filter(InstagramImageNoRss.published == published).all()
                if existing:
                    continue

                db_object = InstagramImageNoRss()
                db_object.username = username
                db_object.local_path = current_filepath
                db_object.published = published

                local_path_txt = os.path.join(
                    current_directory, current_filedate + '_UTC.txt')
                if os.path.exists(local_path_txt):
                    db_object.local_path_txt = local_path_txt
                    db_object.text_data = read_text(local_path_txt)

                local_path_json = os.path.join(
                    current_directory, current_filedate + '_UTC.json')
                if os.path.exists(local_path_json):
                    db_object.local_path_json = local_path_json
                    db_object.json_data = read_json(local_path_json)

                comments_path = os.path.join(
                    current_directory,
                    current_filedate + '_UTC_comments.json')
                if os.path.exists(comments_path):
                    db_object.comments_path = comments_path
                    db_object.comments_data = read_json(comments_path)

                location_path = os.path.join(
                    current_directory,
                    current_filedate + '_UTC_location.txt')
                if os.path.exists(location_path):
                    db_object.geolocation_path = location_path
                    db_object.geolocation_data = read_text(location_path)

                db_object.sended = False
                db_object.publication_index = index
                session.add(db_object)
                try:
                    session.commit()
                except Exception as e:
                    logging.exception(e)
                    session.rollback()
    except Exception as e:
        logging.exception(e)
    finally:
        session.close()
async def on_chat_message(msg):
    """Handle one incoming Telegram chat message.

    Only text messages are processed. Supported commands:

    * ``/start`` - register the chat (or greet a known one);
    * ``/client`` - accepted, no action and no reply;
    * ``/admin <password>`` - mark the chat as admin when the password
      equals ``config.BOT_ADMIN_PASSWORD``;
    * ``/help`` - send the help text;
    * ``/regex_add <regex>`` - store a regex for this chat;
    * ``/regex_remove_<id>`` - delete one of this chat's regexes;
    * any other ``/regex...`` - list this chat's regexes.

    The ``/regex...`` commands are ignored for chats without a ``Chat`` row.
    Unexpected exceptions are logged and reported to every admin chat; the
    session is always closed.

    Args:
        msg: Telegram message dict as accepted by ``telepot.glance``.
    """
    session = config.Session()
    try:
        content_type, _chat_type, chat_id = telepot.glance(msg)
        if content_type != 'text':
            return

        text = msg['text']
        command = text.lower()

        if text == '/start':
            chat = session.query(Chat).filter(
                Chat.chat_id == chat_id).first()
            if chat is None:
                chat = Chat(chat_id=chat_id, admin=False, tg_ans=str(msg))
                await bot.sendMessage(chat_id, u'Шо?! Новый пользователь?!')
            else:
                await bot.sendMessage(chat_id, u'А я тебя уже знаю')
            session.add(chat)
            try:
                session.commit()
            except Exception as e:
                logging.exception(e)
                session.rollback()

        elif command.startswith('/client'):
            # Recognised so it is not answered as an unknown command; the
            # branch has never done anything else.
            pass

        elif command.startswith('/admin'):
            passw = re.search(r'\/admin\s+(?P<passw>\w+)', text)
            if passw and passw.group('passw') == config.BOT_ADMIN_PASSWORD:
                chat = session.query(Chat).filter(
                    Chat.chat_id == chat_id).first()
                if chat is None:
                    # The chat never sent /start: register it right away
                    # instead of failing on a missing row.
                    chat = Chat(chat_id=chat_id, admin=True, tg_ans=str(msg))
                else:
                    chat.admin = True
                session.add(chat)
                try:
                    session.commit()
                except Exception as e:
                    logging.exception(e)
                    session.rollback()
                await bot.sendMessage(chat_id, u'Слушаю и повинуюсь, хозяин')
                await _send_help(chat_id)
            else:
                await bot.sendMessage(
                    chat_id, u'ЭЭЭ ТЫ КТО ТАКОЙ? ДАВАЙ ДО СВИДАНИЯ!')

        elif command == '/help':
            await _send_help(chat_id)

        elif command.startswith('/regex'):
            known_chat = session.query(Chat).filter(
                Chat.chat_id == chat_id).first()
            if known_chat is None:
                return

            if command.startswith('/regex_add'):
                add_match = re.search(
                    r'\/regex_add\s+(?P<regex_text>.*)', text)
                if add_match and add_match.group('regex_text'):
                    new_regex = add_match.group('regex_text').strip()
                    session.add(Regexes(chat_id=chat_id, regex=new_regex))
                    try:
                        session.commit()
                    except Exception as e:
                        logging.exception(e)
                        session.rollback()
                    # Report the regex text itself, not the Match object.
                    await bot.sendMessage(
                        chat_id, u'Added regex {}'.format(new_regex))
                else:
                    await bot.sendMessage(chat_id, u'Not found regex')

            elif command.startswith('/regex_remove'):
                remove_match = re.search(
                    r'\/regex_remove_(?P<uid>\w+)', text)
                regex_obj = None
                if remove_match:
                    # Restrict the lookup to this chat so one chat cannot
                    # delete another chat's regexes.
                    regex_obj = session.query(Regexes).filter(
                        Regexes.id == remove_match.group('uid'),
                        Regexes.chat_id == chat_id).first()

                if regex_obj is None:
                    await bot.sendMessage(
                        chat_id, u'Error on removing regex not found uid')
                else:
                    # Read the text before deleting: the instance should not
                    # be touched again once the delete has been committed.
                    removed_regex = regex_obj.regex
                    session.delete(regex_obj)
                    try:
                        session.commit()
                    except Exception as e:
                        logging.exception(e)
                        session.rollback()
                        await bot.sendMessage(
                            chat_id,
                            u'Error on removing regex <b>{}</b> with exception {}'
                            .format(removed_regex, str(e)),
                            parse_mode='html')
                    else:
                        await bot.sendMessage(
                            chat_id,
                            u'Removed regex <b>{}</b>'.format(removed_regex),
                            parse_mode='html')

            else:
                regexes = session.query(Regexes).filter(
                    Regexes.chat_id == chat_id).all()
                for regex in regexes:
                    await bot.sendMessage(
                        chat_id,
                        u"Регулярное выражение <b>{0}</b>. Удалить /regex_remove_{1}"
                        .format(regex.regex, regex.id),
                        parse_mode='html')

        else:
            await bot.sendMessage(chat_id, 'Неизвестная команда.')

    except Exception as e:
        logging.exception(e)
        # A failed statement leaves the transaction unusable until it is
        # rolled back; do that before querying the admin chats.
        session.rollback()
        admin_chat_ids = [
            x.chat_id for x in session.query(Chat).filter(
                Chat.admin == True).all()  # noqa: E712
        ]
        for admin_chat_id in admin_chat_ids:
            await bot.sendMessage(chat_id=admin_chat_id,
                                  text='Exception {} with {}'.format(
                                      str(e), traceback.format_exc()))
    finally:
        session.close()
def _theme_matches(regexes, theme_name):
    """Return True if any stored regex matches ``theme_name``.

    A stored pattern that does not compile is logged and skipped, so one bad
    user-supplied regex cannot stop delivery for everybody.
    """
    for stored in regexes:
        try:
            if re.search(stored.regex, theme_name):
                return True
        except re.error as e:
            logging.exception(e)
    return False


async def send_themes():
    """Send unsent forum themes to the admin chats in an endless loop.

    Every cycle loads the ``WatchRuTheme`` rows with ``sended == False``.
    For each theme a message is built (the "updated" or "added" wording,
    depending on ``was_updated``) and sent to every admin chat that has at
    least one regex matching the theme name, followed by the screenshot as a
    document. A failing screenshot upload is logged and ignored.

    After all admin chats were considered the theme is marked as sent and
    committed. Other exceptions are logged and reported to the admin chats.
    The loop sleeps ``config.TIME_SLEEP_SENDER`` seconds between cycles.
    """
    updated_template = '''В секции форума <b>{}</b> обновилась тема:\n <i>{}</i>\n __Обновлена__: {}\nurl: http://forum.watch.ru/showthread.php?t={}'''
    added_template = '''В секции форума <b>{}</b> добавлена тема:\n <i>{}</i>\n __Добавлена__: {}\nurl: http://forum.watch.ru/showthread.php?t={}'''

    while True:
        logging.info('sending themes to admins')
        session = config.Session()
        try:
            themes_to_send = session.query(WatchRuTheme).filter(
                WatchRuTheme.sended == False).all()  # noqa: E712

            # Admin chats and their regexes do not depend on the theme, so
            # they are loaded once per cycle instead of once per theme.
            regexes_by_chat = {}
            if themes_to_send:
                admin_chat_ids = [
                    x.chat_id for x in session.query(Chat).filter(
                        Chat.admin == True).all()  # noqa: E712
                ]
                for chat_id in admin_chat_ids:
                    regexes_by_chat[chat_id] = session.query(Regexes).filter(
                        Regexes.chat_id == chat_id).all()

            for theme in themes_to_send:
                if theme.was_updated:
                    template = updated_template
                else:
                    template = added_template
                message = template.format(
                    theme.section.replace('"', ''),
                    theme.theme_name.replace(
                        'http://forum.watch.ru/images/market-question.png',
                        ''),
                    theme.last_update,
                    theme.theme_id)

                for chat_id, regexes in regexes_by_chat.items():
                    if not _theme_matches(regexes, theme.theme_name):
                        continue
                    await bot.sendMessage(chat_id=chat_id,
                                          text=message,
                                          parse_mode='html')
                    try:
                        with open(theme.screenshot_path, 'rb') as cf:
                            await bot.sendDocument(
                                chat_id=chat_id,
                                document=cf,
                                caption=theme.theme_name[:50])
                    except Exception as e:
                        # The text message already went out; a missing or
                        # rejected screenshot is not fatal.
                        logging.exception(e)

                theme.sended = True
                theme.was_updated = False
                session.add(theme)
                try:
                    session.commit()
                except Exception as e:
                    logging.exception(e)
                    session.rollback()
        except Exception as e:
            logging.exception(e)
            # A failed statement leaves the transaction unusable until it is
            # rolled back; do that before querying the admin chats.
            session.rollback()
            error_chat_ids = [
                x.chat_id for x in session.query(Chat).filter(
                    Chat.admin == True).all()  # noqa: E712
            ]
            for chat_id in error_chat_ids:
                await bot.sendMessage(chat_id=chat_id,
                                      text='Exception {} with {}'.format(
                                          str(e), traceback.format_exc()))
        finally:
            session.close()
        await asyncio.sleep(config.TIME_SLEEP_SENDER)