def gather_process(): logger.info("gather") storage = FileStorage(SCRAPPED_FILE) # You can also pass a storage scrapper = Scrapper() scrapper.scrap_process(storage)
def gather_process(): logger.info("gather") storage = FileStorage(SCRAPPING_FILE) # You can also pass a storage scrapper = Scrapper() scrapper.scrap_process(storage, SCRAPPING_DIRECTORY, SCRAPPING_PAGES_COUNT)
def gather_process(obj_count):
    logger.info("gather")
    storage = FileStorage(SCRAPPED_JSON)

    # You can also pass a storage
    scrapper = Scrapper(obj_count)
    scrapper.scrap_process(storage)

def gather_process(): logger.info("gather") storage_authors = FileStorage(SCRAPPED_AUTHORS) storage_author_info = FileStorage(SCRAPPED_AUTHOR_INFO) scrapper = Scrapper() scrapper.scrap_process(storage_authors, storage_author_info)
def gather_process(): logger.info("gather") storage = FileStorage(SCRAPPED_FILE) # You can also pass a storage scrapper = Scrapper(limit=VACANCIES_LIMIT, per_page=VACANCIES_PER_PAGE, area=VACANCIES_SEARCH_AREA, specialization=SPECIALIZATION) scrapper.scrap_process(storage)
def parse_tickers(sect, base_url):
    # url = 'https://www.estimize.com/sectors/{sector}?per={max_tickers}'.format(sector=sect, max_tickers=MAX_TICKERS)
    url = base_url.format(sector=sect, max_tickers=MAX_TICKERS)
    scraper = Scrapper()
    soup = BeautifulSoup(scraper.scrap_process(url), 'lxml')  # lxml parser
    tickers_table_dict = {}

    # Determine the reference year
    season_txt = soup.find('div', {'class': 'season'}).find('strong').text
    year_txt = season_txt.split(' ')[1]

    # Download the tickers table
    tickers_html = soup.find('div', {'class': 'linked-table'})
    items = tickers_html.find_all(
        'a', {'class': ['linked-row opened', 'linked-row closed']})
    for item in items:
        line_dict = {}
        ticker_nm = item.find('div', {'class': 'td symbol'}).text.replace('\n', '')
        # Sometimes there are two reports in a single quarter; keep the earliest one
        if ticker_nm not in tickers_table_dict:
            date_str = item.find('div', {'class': 'td reports'}).text.replace('\n', '')[:-3] + ' ' + year_txt
            line_dict['Report'] = pd.to_datetime(date_str, format='%b %d %Y')
            line_dict['Time'] = item.find('div', {'class': 'td reports'}).text.replace('\n', '')[-3:]
            line_dict['Quarter'] = item.find('div', {'class': 'td quarter'}).text.replace('\n', '')
            line_dict['Sector'] = sect
            tickers_table_dict[ticker_nm] = line_dict

    df_out = pd.DataFrame(tickers_table_dict).T
    df_out.index.rename('tic', inplace=True)
    return df_out

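# A possible way to call parse_tickers across several sectors. MAX_TICKERS, the
# base URL, and the sector slugs below are illustrative assumptions, not values
# taken from the original project.
MAX_TICKERS = 200  # assumed module-level constant read inside parse_tickers
BASE_URL = 'https://www.estimize.com/sectors/{sector}?per={max_tickers}'
sectors = ['information_technology', 'energy']  # hypothetical sector slugs

# Concatenate the per-sector tables into one report calendar indexed by ticker
reports = pd.concat(parse_tickers(sect, BASE_URL) for sect in sectors)
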
def gather_process_test():
    logger.info("gather")
    storage = FileStorage(SCRAPPED_FILE)
    parser = html_parser.HtmlParser(fields=[])

    # You can also pass a storage
    # data_dict = {'marka': [], 'model': [], 'price': [], 'year': [], 'probeg': [], 'owners': [], 'photo_len': []}
    scrapper = Scrapper()
    url = START_SCRAPING_PAGE.replace('###p=1###', 'p=1')
    response = scrapper.scrap_process(url=url)
    data = parser.get_parsed_data_test(response.text, DOMEN)

def gather_process(): logger.info("gather") storage = JSONStorage(SCRAPPED_FILE) quotes_storage = JSONStorage(QUOTES_FILE) # You can also pass a storage # Search vacancies using popular job title name in data analytics # https://blog.udacity.com/2018/01/4-types-data-science-jobs.html search_texts = ['Data Analyst', 'Machine Learning Engineer', 'Data Engineer', 'Data Scientist'] search_params = {'page': 0, 'no_magic': False, 'period': 30, 'only_with_salary': False} # Scrape information about vacancies scrapper = Scrapper() scrapper.scrap_process(storage, search_texts, search_params) # Scrape information about current quotes for currencies vs RUR quotes_scrapper = QuotesScrapper() quotes_scrapper.scrap_process(quotes_storage, 'RUR')
def gather_process(): logger.info("gather") storage = FileStorage(SCRAPPED_FILE) print("Hello") session = vk.Session() vk_api = vk.API(session) members = vk_api.groups.getMembers(group_id='bolshe_buketa', v=5) i = 0 with open('list_to_csv.csv', 'w', newline='') as csv_file: csv_writer = csv.writer(csv_file) for vk_user_id in members['users']: time.sleep(1) user = vk_api.users.get( user_id=vk_user_id, v=5, fields='name, online,bdate,city,sex,about,connections,contacts' )[0] if 'home_phone' in user: user['home_phone'] = user['home_phone'].replace( '\u2665', '').replace('\u2605', '').replace('\u260e', '').replace(':', '').replace(',', '') if 'about' in user: user['about'] = user['about'].replace('\u2665', '').replace( '\u2605', '').replace('\u260e', '').replace(':', '').replace(',', '') if 'city' in user: city = vk_api.database.getCitiesById(city_ids=user['city'], v=5) if user['city'] != 0: user['city_name'] = city[0]['title'].replace(':', '') else: user['city_name'] = '' del user['city'] i = i + 1 print(i) print(user) try: csv_writer.writerow([user]) except: user['about'] = 'Не удалось декодировать и записать' try: csv_writer.writerow([user]) except: user['home_phone'] = 'Не удалось декодировать и записать' csv_writer.writerow([user]) print('Done') # You can also pass a storage scrapper = Scrapper() scrapper.scrap_process(storage)
def gather_process(pageCounts):
    logger.info("gather")
    scrapper = Scrapper(pageCounts)
    scrapper.scrap_process()

def gather_process(): logger.info("gather") scrapper = Scrapper() scrapper.scrap_process(SCRAPPED_FILE)