def schedule(destination, now):
    """Query the Gencat Rodalies timetable page for trains leaving Mataro.

    :param destination: destination station code as a string
        (pl catalunya:78805, mataro:79500, sants:71801, sant adria:79403)
    :param now: hour of day (int or numeric string); the query window is
        [now-1, now] per the site's horariDesde/horariFins fields
    :return: dict mapping each result-row id to a dict with
        'departureTime', 'arrivalTime' and 'tripTime' strings
    """
    # The site expects "from" and "to" hours; query the hour before `now`.
    now = int(now) - 1
    later = now + 1
    day = time.strftime("%d", time.localtime())
    month = time.strftime("%m", time.localtime())
    year = time.strftime("%Y", time.localtime())
    # pl catalunya:78805, mataro:79500, sants:71801, sant adria:79403,
    # (horariDesde, horariFins + nomes hora)
    values = {'day': day, 'month': month, 'year': year,
              'sourceCode': '79500', 'destinationCode': destination,
              'fromtime': now, 'totime': later}
    data = urlencode(values)
    url = "http://www14.gencat.cat/mobi_rodalies/AppJava/pages/horaris/ResultatCerca.htm"
    page = get_page(url, data)
    p = WebParser()
    p.feed(page)
    timetables = p.dom.get_element_by_id('timetablesTable')
    schedules = timetables.get_elements_by_tag('li')
    time_table = {}
    for schedule_item in schedules:
        item_id = schedule_item.get_id()
        departure = schedule_item.get_elements_by_class('departureTime')
        arrival = schedule_item.get_elements_by_class('arrivalTime')
        triptime = schedule_item.get_element_by_id('tripTimeText')
        # BUG FIX: the original tested `departure != [] or arrival != []`,
        # then indexed BOTH lists — an IndexError whenever only one of them
        # was present. Require both before building the entry.
        if departure and arrival:
            time_table[item_id] = {'departureTime': departure[0].get_text(),
                                   'arrivalTime': arrival[0].get_text(),
                                   'tripTime': triptime.get_text()}
    return time_table
def __init__(self):
    """Create the UI and API parser and reset paging/sorting state."""
    # Front-end and devRant API wrapper.
    self.gui = GUI()
    self.web_parser = WebParser("www.devrant.io")
    # History of commands, replayed by the r/b/n shortcuts.
    self.executed_commands = []
    # Default sort and state before any command runs.
    self.sort = "recent"
    self.state = "NO_STATE"
    # Paging window: skip `page` items, fetch `limit` items.
    self.page = 0
    self.limit = 20
def stations():
    """Scrape the R1 line detail page and return {station_id: station_name}."""
    query = urlencode({'lineId': 'R1'})
    url = "http://www14.gencat.cat/mobi_rodalies/AppJava/pages/linies/Detall.htm"
    parser = WebParser()
    parser.feed(get_page(url, query))
    # The first 'RodaliesList' element holds one <li> per station.
    items = parser.dom.get_elements_by_class('RodaliesList')[0] \
                      .get_elements_by_tag('li')
    result = {}
    for item in items:
        # Drop the first 9 characters of the element id to get the station code.
        code = item.get_id()[9:]
        name_node = item.get_elements_by_class('stationName')[0]
        name_node = name_node.get_elements_by_tag('xml-fragment')[0]
        result[code] = name_node.get_text()
    return result
def collect_links_and_data(self, page_url):
    """Fetch ``page_url``, parse it if it is HTML, and return the page's links.

    Side effect: stores the parsed tag data in ``self.data_list``.

    :param page_url: absolute URL to download
    :return: list of URLs found on the page; empty list for non-HTML responses

    BUG FIX: ``parser`` was only bound inside the HTML branch, so a non-HTML
    response made the final ``return parser.get_page_urls()`` raise NameError.
    """
    # Fixes ssl issue for some mac users
    if (not os.environ.get('PYTHONHTTPSVERIFY', '')
            and getattr(ssl, '_create_unverified_context', None)):
        ssl._create_default_https_context = ssl._create_unverified_context
    parser = None
    try:
        response = urlopen(page_url)
        if "text/html" in response.getheader(
                "Content-Type"):  # Check to see if HTML response
            html_bytes = response.read()  # Read the bytestream in response
            html_string = html_bytes.decode("utf-8")  # Decode bytestream as utf-8
            parser = WebParser(self.base_url)  # Custom webparser for this site
            parser.feed(html_string)  # Execute parser
            self.data_list = parser.get_data_with_tags()  # Retrieve datalist
    except Exception as e:
        # NOTE(review): broad catch + exit is the original crawler's policy;
        # kept as-is to preserve behavior for callers.
        print("Error: " + str(e))
        print("Program will terminate")
        sys.exit()
    # Non-HTML responses yield no parser: report no links instead of crashing.
    return parser.get_page_urls() if parser is not None else []
def collect_links_and_data(self, page_url):
    """Fetch ``page_url``, parse it if it is HTML, and return the page's links.

    Side effect: stores the parsed tag data in ``self.data_list``.

    :param page_url: absolute URL to download
    :return: list of URLs found on the page; empty list for non-HTML responses

    BUG FIX: ``parser`` was only bound inside the HTML branch, so a non-HTML
    response made the final ``return parser.get_page_urls()`` raise NameError.
    """
    parser = None
    try:
        response = urlopen(page_url)
        if "text/html" in response.getheader(
                "Content-Type"):  # Check to see if HTML response
            html_bytes = response.read()  # Read the bytestream in response
            html_string = html_bytes.decode("utf-8")  # Decode bytestream as utf-8
            parser = WebParser(self.base_url)  # Custom webparser for this site
            parser.feed(html_string)  # Execute parser
            self.data_list = parser.get_data_with_tags()  # Retrieve datalist
    except Exception as e:
        # NOTE(review): broad catch + exit is the original crawler's policy;
        # kept as-is to preserve behavior for callers.
        print("Error: " + str(e))
        print("Program will terminate")
        sys.exit()
    # Non-HTML responses yield no parser: report no links instead of crashing.
    return parser.get_page_urls() if parser is not None else []
from fastapi import FastAPI
from fastapi import BackgroundTasks
from models import AvitoPair
from db_handler import DataBaseHandler
from web_parser import WebParser
import datetime
import asyncio

# Module-level singletons shared by the API endpoints.
app = FastAPI()  # API app
data_base_handler = DataBaseHandler(
    path="Avito")  # handles all database operations
web_parser = WebParser()  # handles all operations with the web site scrapping
# NOTE(review): asyncio.get_event_loop() at import time is deprecated on
# recent Python — confirm the target runtime before changing it.
loop = asyncio.get_event_loop()  # event loop of asyncio


async def update_timestamps(pair: AvitoPair):
    """
    Updates all information about timestamps and counters for the pair
    Updates information about top posts for the pair
    :param pair: pair of phrase and region

    Runs forever: one refresh pass per hour. Intended to be scheduled as a
    background task; it never returns.
    """
    while True:
        # adds data about timestamps in the database:
        # (add_timestamp / add_top_posts are presumably defined elsewhere in
        # this module — not visible in this chunk.)
        add_timestamp(pair=pair)
        # Adds top 5 links for the pair in the database
        add_top_posts(pair=pair)
        await asyncio.sleep(3600)  # sleeps for an hour
class InputHandler:
    """Translates interactive menu input into devRant API commands.

    Maintains paging (`page`/`limit`), sort order and a command history so
    that the shortcuts r (repeat), b (back one page / previous command) and
    n (next page) can replay earlier requests.

    FIXES: regex patterns are now raw strings (the originals contained
    ``\\s``/``\\d`` in plain strings — an invalid-escape SyntaxWarning on
    modern Python); ``== True`` / ``!= None`` comparisons replaced with
    idiomatic truth tests. Control flow is unchanged.
    """

    __ADDRESS_BASE = "/api/devrant/"

    def __init__(self):
        """Create the UI and API parser and reset paging/sorting state."""
        self.gui = GUI()
        self.web_parser = WebParser("www.devrant.io")
        self.executed_commands = []
        self.page = 0
        self.limit = 20
        self.sort = "recent"
        self.state = "NO_STATE"

    def handle_input(self):
        """Main loop: read input, build the command, execute, print rants."""
        running = True
        while running:
            user_input = self.gui.show_menu("NO_STATE")
            if 'exit' == user_input:
                return
            command = self.get_command(user_input)
            if command[0]:
                # r/b/n replay history — don't record them as new commands.
                if 'r' != user_input and 'b' != user_input and 'n' != user_input:
                    self.executed_commands.append(user_input)
                rants = self.web_parser.execute_command(command[1], command[2])
                if rants is not None:
                    self.gui.print_rants(rants, int((self.page / self.limit) + 1))

    def get_command(self, user_input):
        """Map ``user_input`` to ``[valid, address, params]``.

        :param user_input: raw menu string (e.g. 'view 10', 'top 5', 'search x')
        :return: list of [bool is_valid, str api_address or None, dict params or None]
        """
        retval = [False, None, None]
        if 'r' == user_input:
            # Repeat: re-run the most recent recorded command.
            if len(self.executed_commands) > 0:
                return self.get_command(self.executed_commands[-1])
        if 'b' == user_input:
            # Back: previous page if possible, otherwise previous command.
            if len(self.executed_commands) > 0:
                if self.page - self.limit >= 0 and self.__pageState(self.state):
                    self.page -= self.limit
                elif len(self.executed_commands) > 1:
                    self.executed_commands.pop()
                return self.get_command(self.executed_commands[-1])
        if 'n' == user_input:
            # Next: advance one page and re-run the last command.
            if len(self.executed_commands) > 0:
                self.page += self.limit
                return self.get_command(self.executed_commands[-1])
        if 'surprise' == user_input:
            self.__set_state("SURPRISE")
            retval[1] = InputHandler.__ADDRESS_BASE + "rants/" + user_input
            retval[2] = {'app': 3}
            retval[0] = True
        res = re.match(r'sort(\s)(algo|recent)', user_input)
        if res:
            # Changes the sort for subsequent view commands; not a request itself.
            self.sort = res.group(2)
            self.page = 0
        res = re.match(r'view(\s)(\d+)', user_input)
        if res:
            self.__set_state("VIEW")
            self.limit = int(res.group(2))
            retval[1] = InputHandler.__ADDRESS_BASE + "rants"
            retval[2] = {
                'app': 3,
                'sort': self.sort,
                'limit': self.limit,
                'skip': self.page
            }
            retval[0] = True
        else:
            res = re.match(r'view', user_input)
            if res:
                self.__set_state("VIEW")
                self.limit = 20
                retval[1] = InputHandler.__ADDRESS_BASE + "rants"
                retval[2] = {
                    'app': 3,
                    'sort': self.sort,
                    'limit': self.limit,
                    'skip': self.page
                }
                retval[0] = True
        res = re.match(r'search(\s)(.*)', user_input)
        if res:
            self.__set_state("SEARCH")
            retval[1] = InputHandler.__ADDRESS_BASE + "search"
            retval[2] = {'app': 3, 'term': res.group(2), 'skip': self.page}
            retval[0] = True
        res = re.match(r'top(\s)(\d+)', user_input)
        if res:
            self.__set_state("TOP")
            self.limit = int(res.group(2))
            retval[1] = InputHandler.__ADDRESS_BASE + "rants"
            retval[2] = {
                'app': 3,
                'sort': 'top',
                'limit': self.limit,
                'skip': self.page
            }
            retval[0] = True
        res = re.match(r'get(\s)(\d+)', user_input)
        if res:
            self.__set_state("GET")
            retval[1] = InputHandler.__ADDRESS_BASE + "rants/" + res.group(2)
            retval[2] = {'app': 3}
            retval[0] = True
        return retval

    def __set_state(self, state):
        """Switch state; reset paging when moving between pageable states."""
        if self.state != state:
            if self.__pageState(state) and self.__pageState(self.state):
                self.page = 0
                self.limit = 20
            self.state = state

    def __pageState(self, state):
        """Return True for states whose results support paging."""
        if state == "VIEW" or state == "TOP" or state == "SEARCH":
            return True
        return False
def main(): #check if there is another instance pid_file = 'program.pid' fp = open(pid_file, 'w') try: fcntl.lockf(fp, fcntl.LOCK_EX | fcntl.LOCK_NB) except IOError: sys.exit('Another instance is running') # get settings from cinfig.ini.sample file time_format = "%a %b %d %H:%M:%S %Y" action_info = {} start = datetime.now() action_info['start_time'] = start.strftime(time_format) action_info['machine_ip'] = socket.gethostbyname(socket.gethostname()) action_info['directory_name'] = os.getcwd() link = ' ' command_line = link.join(sys.argv) action_info['command_line'] = command_line try: action_info['version'] = pkg_resources.get_distribution( 'nrc_ngs_dl').version except: pass try: args = parse_input_args(sys.argv[1:]) except: sys.exit('Usage: lims_downloader -c /path/to/configuation.ini') if not args.config_file: sys.exit('Usage: lims_downloader -c /path/to/configuation.ini') config_file = args.config_file if os.path.exists(config_file) == False: sys.exit('Error: config_file %s not exist' % config_file) try: config_setting = ConfigSetting(config_file) except IOError: sys.exit('Cannot open file:' + config_file + '; Cannot get the configuration settings') #set up logging try: set_up_logging(config_setting.log_file_name, config_setting.log_name, config_setting.log_level) except: sys.exit('Cannot locate the log file ' + config_setting.log_file_name) logger = logging.getLogger('nrc_ngs_dl.lims_downloader') if os.path.exists(config_setting.destination_folder) == False: logger.info('folder %s not exist, create the folder' % config_setting.destination_folder) try: os.makedirs(config_setting.destination_folder) except: logger.error('Cannot create the destination folder %' % config_setting.destination_folder) sys.exit(1) if os.access(config_setting.destination_folder, os.R_OK or os.W_OK) == False: logger.error('Do not have permission to access the %s' % config_setting.destination_folder) sys.exit(1) #connect to database if the database exist #otherwise create tables for this 
database try: lims_database = LimsDatabase(config_setting.db_name) except: #if lims_database is None: logger.error('Cannot access the database %s' % config_setting.db_name) sys.exit('Cannot access the database ' + config_setting.db_name) action_id = lims_database.insert_action_info(action_info) #login to LIMS webpage try: logger.info('Logging into NRC-LIMS web page ') web_parser = WebParser(config_setting.login_url, config_setting.runlist_url, config_setting.username, config_setting.password) except: logger.error('Failed to log in') sys.exit(1) #get a list of all the completed sequence runs #information for each run : url_for_the_run, run_name, plate_name, #Plateform, Operator, Creation Date, Description, status try: logger.info('Getting run list') run_list = web_parser.get_runlist(config_setting.table_run_list, config_setting.column_run_link, config_setting.column_run_status) except: logger.error('Cannot get the list of sequence runs') sys.exit(1) mapping_file = config_setting.mapping_file_name if not os.path.exists(mapping_file): mapping_backup = open(mapping_file, 'w') mapping_backup.write('run_name\trun_description\n') mapping_backup.flush() mapping_backup.close() #for each sequence run in the list, #1. check if it is a new data or re-processed data #2. in the case of reprocessed data: remove the data and related information in the sqlite database #3. 
in the case of new/reprocessed data: download the data, insert the information of the data into database tables package_downloaded = 0 number_tries = int(config_setting.number_retries) + 1 while number_tries > 0: logger.info('==== number of tries: %s ' % (int(config_setting.number_retries) + 2 - number_tries)) number_tries -= 1 retry_list = [] for run_url in run_list: try: run_info = web_parser.get_runinfo(run_url) except: logger.warn('Cannot get run_info for run_url ( %s )' % (run_url)) retry_list.append(run_url) continue try: lane_list, file_list = web_parser.get_laneinfo( run_url, config_setting.table_file_list, config_setting.column_lane, config_setting.column_file_link) except: logger.warn( 'Cannot get lane_list and file_list for run_name %s)' % (run_info['run_name'])) retry_list.append(run_url) continue multiple_lane = len(lane_list) for a_lane in lane_list: folder_name = run_info['run_name'] if multiple_lane > 1: folder_name = run_info['run_name'] + '_lane' + str( a_lane['lane_index']) #if int(a_lane['http_content_length']) > 10700000000: # logger.warn('Data size %s > 10GB, skip the data' % (a_lane['http_content_length'])) # continue case = lims_database.get_run_case(run_info, a_lane) if case == lims_database.RUN_OLD: logger.info( 'Data already downloaded (run_name %s, lane_index %s)' % (run_info['run_name'], a_lane['lane_index'])) if case == lims_database.RUN_REPROCESSED: logger.info( 'Deleting records in database for re-processed data (run_name %s, lane_index %s)' % (run_info['run_name'], a_lane['lane_index'])) lims_database.delete_old_run(run_info, a_lane) if case == lims_database.RUN_REPROCESSED or case == lims_database.RUN_NEW: logger.info( 'Downloading new/re-processed data (run_name %s, lane_index %s)' % (run_info['run_name'], a_lane['lane_index'])) output_path = os.path.join( config_setting.destination_folder, a_lane['package_name']) time_and_size = web_parser.download_zipfile( a_lane['pack_data_url'], output_path) if a_lane['http_content_length'] != 
time_and_size[2]: logger.warn( 'Downloaded file size %s is different with the http_content_length %s' % (time_and_size[2], a_lane['http_content_length'])) os.unlink(output_path) retry_list.append(run_url) else: sequence_run = SequenceRun( a_lane, folder_name, file_list, config_setting.destination_folder, config_setting.folder_mode, config_setting.file_mode) if sequence_run.unzip_package( time_and_size[2], a_lane['http_content_length']): sequence_run.rename_files() package_downloaded += 1 mapping_backup = open(mapping_file, 'a') a_string = run_info['run_name'] + '\t' + run_info[ 'description'] + '\n' mapping_backup.write(a_string) mapping_backup.flush() mapping_backup.close() rowid = lims_database.insert_run_info( run_info, action_id) lims_database.insert_file_info( rowid, sequence_run.file_info, a_lane['lane_index']) lims_database.insert_package_info( rowid, time_and_size) lims_database.insert_lane_info( rowid, run_url, a_lane) lims_database.update_package_downloaded( package_downloaded, action_id)