class WordCloudHelper():
    """Class that is responsible for data masks and fonts."""

    def __init__(self, work_dir=None):
        """
        (obj, str) -> None

        Initializing the class.
        """
        # assert work_dir is not None, \
        #     "work_dir should not be None"
        self.dir_helper = DirectoryHelper(work_dir)
        self.save_dir = self.dir_helper.work_dir
        self.dir_helper.prepare_working_directory()
        print '[i] working directory prepared'

    def load_fonts(self, selected_fonts=None):
        """
        (obj, list) -> dict

        Loading fonts as specified in the list or by iterating
        over the folder with fonts.
        """
        BASE_FOLDER = self.dir_helper.upper_directory() + 'fonts\\'
        fonts = {}
        if selected_fonts is not None:
            for font in selected_fonts:
                fonts[font] = BASE_FOLDER + font + '.ttf'
        else:
            for f in os.listdir(BASE_FOLDER):
                if f[-4:].lower() == '.ttf':
                    fonts[f[:-4]] = BASE_FOLDER + f
        return fonts

    def load_masks(self, selected_masks=None):
        """
        (obj, list) -> dict

        Loading masks as specified in the list or by iterating
        over the folder with masks.
        """
        BASE_FOLDER = self.dir_helper.upper_directory() + 'masks\\'
        masks = {}
        if selected_masks is not None:
            for mask in selected_masks:
                masks[mask] = BASE_FOLDER + mask + '.png'
        else:
            for f in os.listdir(BASE_FOLDER):
                if f[-4:].lower() == '.png':
                    masks[f[:-4]] = BASE_FOLDER + f
        return masks
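# WordCloudHelper above only needs a few things from DirectoryHelper: a work_dir
# attribute, prepare_working_directory() and upper_directory(). The real helper is
# not shown in this snippet; the class below is a hypothetical stand-in that merely
# illustrates the assumed interface.
import os


class DirectoryHelperSketch(object):
    """Hypothetical stand-in for the DirectoryHelper used above."""

    def __init__(self, work_dir=None):
        # assumption: fall back to a local '__temp__' folder, as the crawlers below do
        self.work_dir = work_dir if work_dir is not None else '__temp__'

    def prepare_working_directory(self):
        # create the working directory if it does not exist yet
        if not os.path.exists(self.work_dir):
            os.makedirs(self.work_dir)

    def upper_directory(self):
        # assumption: parent of the working directory, with a trailing separator,
        # so that 'fonts\\' or 'masks\\' can be appended directly
        return os.path.abspath(os.path.join(self.work_dir, os.pardir)) + os.path.sep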
    def config_stopwords(self):
        """
        (obj) -> None

        Collecting custom stop words from the configured files and
        passing them to the parent class.
        """
        # processing custom stop words from the specified files
        custom_stopwords = list()
        dh = DirectoryHelper()
        for sw_file in stop_words_files:
            sw_fullpath = stop_words_folder + '\\' + sw_file
            sw_content = dh.read_file_utf8(sw_fullpath)
            for sw_single in sw_content.split('\n'):
                custom_stopwords.append(sw_single.strip())
        # adding the fetched stop words
        super(HabrServiceToWC, self).config_stopwords(custom_stopwords)
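# config_stopwords() above relies on module-level stop_words_folder and
# stop_words_files that are not shown in this snippet. A hypothetical
# configuration (folder and file names are placeholders) and the resulting call:
#
#   stop_words_folder = 'stopwords'
#   stop_words_files = ['russian.txt', 'english.txt', 'custom.txt']
#
#   service = HabrServiceToWC()
#   service.config_stopwords()   # reads each UTF-8 file, one stop word per line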
class BSCrawler():
    """Crawling HTML pages and fetching their data into table form."""

    UA = ('Mozilla/5.0 (X11; U; FreeBSD i386; WOW64) AppleWebKit/537.36 '
          '(KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36')

    def __init__(self):
        """
        Initial method that:
            - initiates the helper class;
            - checks that the temp directory exists.
        """
        self.logger = hlp.custom_logger()
        self.helper = DirectoryHelper()
        #self.helper.prepare_working_directory()
        try:
            self.work_dir = self.helper.work_dir
        except:
            self.work_dir = '__temp__'
        if not os.path.exists(self.work_dir):
            os.makedirs(self.work_dir)
        self.logger.info('[i] files will be saved into folder "{0}"'.format(self.work_dir))

    def crawl(self, mergedata, urlfile=None):
        """Extracting URLs from the given file and processing them.

        Args:
            mergedata: if True, all processed data is merged into one CSV file
            urlfile: file that contains the URLs to be processed
        """
        self.logger.info('[i] given URLs will be processed')
        # data buffer for all processed URLs
        data = []
        if urlfile:
            with codecs.open(urlfile, 'r', encoding='utf8') as f_urls:
                for line in f_urls:
                    stripped = line.strip()
                    if not stripped.startswith('#') and len(stripped) != 0 and stripped.startswith('http'):
                        self.logger.info('[i] following URL is going to be parsed:\n {0}'.format(stripped))
                        try:
                            doc = self.download_document(stripped)
                            data += self.process_uol_bibliography_tbl(doc)
                            sleep(SLEEP_TIME_IN_SECONDS)
                        except Exception as ex:
                            self.logger.error('[e] exception: {0}, arguments: {1}'.format(ex.message, ex.args))
                    # else:
                    #     self.logger.info('[i] following URL was ignored:\n {0}'.format(stripped))
        # processing graduated PhDs of Computer Science
        url_graduated = 'http://www.uni-oldenburg.de/informatik/studium-lehre/promotion/promotionen/'
        sleep(SLEEP_TIME_IN_SECONDS)
        doc_graduated = self.download_document(url_graduated)
        self.process_uol_graduated_phds(doc=doc_graduated, output_file_name='cs-graduated-phds')
        # merging together all processed data
        if mergedata:
            csv = self.data_as_csv(data)
            self.helper.save_file(os.path.join(self.work_dir, 'uolbibliography-merged.csv'), csv)
        self.logger.info('[i] given URLs were processed')

    def download_document(self, url):
        """Downloading an HTML page and storing it in a string.

        Args:
            url: URL to be downloaded

        Returns:
            downloaded HTML
        """
        html = None
        try:
            req = urllib2.Request(url=url, headers={'User-Agent': self.UA})
            hdl = urllib2.urlopen(req)
            html = hdl.read()
        except Exception as ex:
            self.logger.error('[e] exception: {0}, arguments: {1}'.format(ex.message, ex.args))
        return html

    def get_data_from_table(self, table_body):
        """Getting data from an HTML table with BeautifulSoup."""
        data = []
        rows = table_body.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            cols = [ele.text.strip() + " " for ele in cols]  # add extra space for empty values
            #data.append([elem for ele in cols if ele])  # get rid of empty values
            data.append([ele.strip() for ele in cols])  # take all values
        return data

    def validate_file_name(self, file_name):
        """Removing all symbols that are not file-name conform.

        Args:
            file_name: name of the file to be validated

        Returns:
            valid name of the file
        """
        valid_chars = '-_.() abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
        file_name = ''.join(c for c in file_name if c in valid_chars)
        return file_name

    # def save_images(self, img_list, folder):
    #     """
    #     (obj, list, str) -> None
    #
    #     Saving images to the given folder.
    #     """
    #     img_dir = self.work_dir + folder + '\\'
    #     self.helper.create_directory_ondemand(img_dir)
    #     for img in img_list:
    #         img_full_url = img['src']
    #         if img['src'][:4] != 'http':
    #             img_full_url = self.main_url + img['src']
    #         img_extention = img_full_url[img_full_url.rfind('.'):]
    #         image = self.download_document(img_full_url)
    #         if image is not None:
    #             gen_img_name = img_dir + self.helper.gen_file_name(extention='')
    #             self.helper.save_img_file(gen_img_name + img_extention, image)
    #         else:
    #             self.logger.info('[i] this image is not found: {0}'.format(img_full_url))

    def decode_abbreviations(self, row, size=8, lang='DE'):
        """Decoding particular abbreviations within the given data."""
        if size != 8:
            return None
        mapping = {}
        mapping['DE'] = {'AU': 'Aufsatz', 'MO': 'Monographie',
                         'ZS': 'Zeitschrift', 'SW': 'Sammelwerksbeitrag'}
        mapping['EN'] = {'AU': 'Article', 'MO': 'Monograph',
                         'ZS': 'Journal', 'SW': 'ContributionsToACollectiveWork'}
        if row[3] in mapping[lang]:
            row[3] = mapping[lang][row[3]]
        return row

    def data_as_csv(self, data, size=8):
        """Getting data as CSV."""
        resulting_csv = ''
        DELIMETER = '","'
        # adding the header
        header_values = ['Fach', 'Autor/in', 'Titel', 'Typ', 'Meldetag',
                         'Punktzahl', 'ZahlOldenburgerAutoren', 'Jahr']
        header_row = DELIMETER.join(header_values)
        resulting_csv = '"' + header_row + '"' + '\n'
        for row in data:
            if len(row) == size:
                csv_row = DELIMETER.join(value.replace('"', "") for value in self.decode_abbreviations(row))
                resulting_csv += '"' + csv_row + '"' + '\n'
        return resulting_csv

    def process_uol_graduated_phds(self, doc, output_file_name=None):
        """Processing the given HTML to extract graduated PhDs.

        Args:
            doc: document to be processed
        """
        bs_html = BeautifulSoup(doc, 'html5lib')
        # getting the name of the file from HTML
        if output_file_name is None:
            output_file_name = self.helper.gen_file_name(extention='')
        # validating
        output_file_name = self.validate_file_name(output_file_name)
        # getting data from the HTML table
        table = bs_html.find('table', attrs={'class': 'farbe_lichtblau breite100'})
        table_body = table.find('tbody')
        data = self.get_data_from_table(table_body)
        # data from list to CSV
        csv_data = ''
        for row in data:
            row_as_csv = '","'.join(row)
            row_as_csv = '"' + row_as_csv + '"' + '\n'
            csv_data += row_as_csv
        self.helper.save_file(os.path.join(self.work_dir, output_file_name + '.csv'), csv_data)

    def process_uol_bibliography_tbl(self, doc, output_file_name=None):
        """Processing the given HTML to extract publication information
        from UOL's Hochschulbibliografie.

        Args:
            doc: document to be processed

        Returns:
            table as a collection of Python lists
        """
        def is_valid_row(data_chunk):
            """Validating any particular row given as input.

            Args:
                data_chunk: some random row

            Returns:
                True/False according to the validity of the data
            """
            if len(data_chunk) < 1:
                return False
            if 'Gesamtpunkte' in data_chunk[0]:
                return False
            return True

        bs_html = BeautifulSoup(doc, 'html5lib')
        # getting the name of the file from HTML
        if output_file_name is None:
            div_tag = bs_html.find('div', attrs={'id': 'inhalt', 'class': 'floatbox'})
            h1_tag = div_tag.find('h1')
            if h1_tag is not None:
                output_file_name = h1_tag.get_text(separator=u' ')
            else:
                output_file_name = self.helper.gen_file_name(extention='')
        # validating the file name
        output_file_name = self.validate_file_name(output_file_name)
        # getting data from the HTML table
        table = bs_html.find('table', attrs={'class': 'infotabelle'})
        table_body = table.find('tbody')
        data = self.get_data_from_table(table_body)
        # cleaning the table data
        cleaned_data = []
        for row in data:
            if is_valid_row(row):
                cleaned_data.append(row)
        csv_data = self.data_as_csv(cleaned_data)
        # text data for debugging, uncomment if needed
        # prettified_html = bs_html.prettify()
        # self.helper.save_file(os.path.join(self.work_dir, output_file_name + '.html'), prettified_html)
        # text_from_html = bs_html.get_text()
        # self.helper.save_file(os.path.join(self.work_dir, output_file_name + '.txt'), text_from_html)
        # keeping the file name length within limits
        target_file = os.path.join(self.work_dir, output_file_name + '.csv')
        if len(target_file) > 250:
            target_file = target_file[:250] + '.csv'
        #self.logger.info(target_file)
        self.helper.save_file(target_file, csv_data)
        return cleaned_data
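# A minimal usage sketch for the crawler above. It assumes the module-level
# dependencies (hlp, DirectoryHelper, SLEEP_TIME_IN_SECONDS, codecs, urllib2,
# BeautifulSoup) are importable and that 'urls.txt' is a hypothetical file with
# one Hochschulbibliografie URL per line; lines starting with '#' are skipped.
if __name__ == '__main__':
    crawler = BSCrawler()
    # one CSV per processed page plus a merged uolbibliography-merged.csv
    crawler.crawl(mergedata=True, urlfile='urls.txt')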
class BSCrawler():

    UA = 'Mozilla/5.0 (X11; U; FreeBSD i386; en-US; rv:1.9.2.9) Gecko/20100913 Firefox/3.6.9'

    def __init__(self):
        """
        Initial method that:
            - initiates the helper class;
            - checks that the temp directory exists.
        """
        self.helper = DirectoryHelper()
        #self.helper.prepare_working_directory()
        try:
            self.work_dir = self.helper.work_dir
        except:
            self.work_dir = '__temp__'
        if not os.path.exists(self.work_dir):
            os.makedirs(self.work_dir)
        print '[i] files will be saved into folder "{0}"'.format(self.work_dir)

    def crawl(self):
        """
        (obj) -> None

        Extracting URLs from the 'dispatcher' dictionary and processing them.
        """
        for url in self.dispatcher:
            print '[i] following url is going to be parsed:\n {0}'.format(url)
            self.main_url = url
            doc = self.download_document(url)
            self.dispatcher[url](self, doc)

    def crawl_dynamic(self):
        """
        (obj) -> None

        Extracting URLs from 'dispatcher_dynamic' and processing them dynamically,
        i.e. the URL template should contain a '%s' pattern to be formatted.
        """
        for url in self.dispatcher_dynamic:
            print '[i] following url is going to be parsed in a dynamic way:\n {0}'.format(url)
            self.dynamic_url = url
            tag = self.dispatcher_dynamic[url][1]
            self.dispatcher_dynamic[url][0](self, tag)

    def download_document(self, url):
        """
        (obj, str) -> (str)

        Downloading an HTML page and storing it in a string.
        """
        html = None
        try:
            req = urllib2.Request(url=url, headers={'User-Agent': self.UA})
            hdl = urllib2.urlopen(req)
            html = hdl.read()
        except Exception as ex:
            print '[e] exception: {0}, arguments: {1}'.format(ex.message, ex.args)
        return html

    def save_images(self, img_list, folder):
        """
        (obj, list, str) -> None

        Saving images to the given folder.
        """
        img_dir = self.work_dir + folder + '\\'
        self.helper.create_directory_ondemand(img_dir)
        for img in img_list:
            img_full_url = img['src']
            if img['src'][:4] != 'http':
                img_full_url = self.main_url + img['src']
            img_extention = img_full_url[img_full_url.rfind('.'):]
            image = self.download_document(img_full_url)
            if image is not None:
                gen_img_name = img_dir + self.helper.gen_file_name(extention='')
                self.helper.save_img_file(gen_img_name + img_extention, image)
            else:
                print '[i] this image is not found: {0}'.format(img_full_url)

    def process_biblio_oldb(self, doc):
        """
        (obj, str) -> None

        Processing the given HTML to extract information about publication rates.
        """
        soup = BeautifulSoup(doc)
        gen_new_name = self.helper.gen_file_name(extention='')
        prettified_html = soup.prettify()
        text_from_html = soup.get_text()
        rows = soup.findAll('tr')
        section = ''
        dataset = {}
        # setting the locale
        try:
            locale.setlocale(locale.LC_ALL, 'de_DE')
        except:
            locale.setlocale(locale.LC_ALL, 'deu_deu')
        #years = ['2010:', '2011:', '2012:', '2013:', '2014:']
        years = ['2012:', '2013:', '2014:']
        for row in rows:
            if row.attrs:
                section = row.text
                dataset[section] = {}
            else:
                cells = row.findAll('td')
                for cell in cells:
                    if 'Gesamtpunkte' in cell.text:
                        text = cell.text.replace(u'\xa0', u' ')
                        splitted_ = text.split(u' ')
                        filtered_ = filter(lambda a: a != u'', splitted_)
                        filtered_ = filtered_[1:]
                        name = ''

                        def biblio_get_name(_input_list, years):
                            """
                            (list, list) -> str

                            Building a proper name for the biblio link.
                            """
                            name = ''
                            for value in _input_list:
                                if value in years:
                                    break
                                name = name + value + ' '
                            name_splitted = name.split(',')
                            name_result = name_splitted[1].strip() + ' ' + name_splitted[0].strip()
                            return name_result

                        name = biblio_get_name(filtered_, years)
                        dataset[name] = ''
                        temp_dict = {}
                        for index, value in enumerate(filtered_):
                            if filtered_[index] in years:
                                temp_dict[filtered_[index]] = filtered_[index + 1]
                        dataset[name] = temp_dict
        final_dataset = {}
        for value in dataset:
            years_dict = {}
            for year in years:
                try:
                    # years_dict[year[:len(year)-1]] = float(dataset[value][year])
                    years_dict[year[:len(year) - 1]] = locale.atof(dataset[value][year])
                except Exception as ex:
                    #print '[e] exception: {0}'.format(ex)
                    years_dict[year[:len(year) - 1]] = float(0.0)
            final_dataset[value] = years_dict
        output_by_year = ''
        keyfunc = ''
        for year in years:
            sorted_ = sorted(final_dataset.keys(), key=lambda x: final_dataset[x][year[:len(year) - 1]])
            output_by_year = output_by_year + '-----------------' + year[:len(year) - 1] + '-----------------------' + '\n'
            for value in sorted_:
                output_by_year = output_by_year + value + ' ' + str(final_dataset[value]) + '\n'
        DELIMETER = ','
        output_as_csv = 'NAME' + DELIMETER + DELIMETER.join(years) + '\n'
        for row_key in final_dataset:
            tmp_row = row_key + DELIMETER
            for year in years:
                tmp_row += str(final_dataset[row_key][year[:-1]]) + DELIMETER
            output_as_csv += tmp_row[:-1] + '\n'
        # text data for debugging, comment out if not needed
        self.helper.save_file(self.work_dir + gen_new_name + '.html', prettified_html)
        self.helper.save_file(self.work_dir + gen_new_name + '.txt', text_from_html)
        self.helper.save_file(self.work_dir + os.path.sep + gen_new_name + '.csv', output_as_csv)
        self.helper.save_file(self.work_dir + os.path.sep + gen_new_name + '.output', output_by_year)

    def process_tutiempo_weather(self, tag):
        """
        (obj, str) -> None

        Processing weather data of a city from tutiempo.
        KRASNOYARSK: http://en.tutiempo.net/climate/ws-284935.html
        NOVOSIBIRSK: http://en.tutiempo.net/climate/ws-296340.html
        """
        delimeter = ';'

        def internal_parser(html):
            """
            (str) -> dict

            Internal parser that parses the given HTML according to its structure.
            """
            soup = BeautifulSoup(html)
            data = []
            table = soup.find('table', attrs={'class': 'medias mensuales'})
            rows = table.findAll('tr')
            for row in rows:
                cols = row.findAll('td')
                cols = [ele.text.strip() for ele in cols]
                data.append([ele for ele in cols if ele])  # get rid of empty values
            daily_weather = {}
            for row in data:
                if len(row) > 3:
                    try:
                        day = int(row[0])
                        if day >= 1 and day <= 31:
                            daily_weather[day] = [row[2], row[3]]
                    except:
                        pass
                        #print '[e] exception: {}'.format(str(ex))
            # sorting values to have them in order
            daily_weather = collections.OrderedDict(sorted(daily_weather.items()))
            return daily_weather

        # params for testing
        years = ['1999']
        months = ['01']
        # params for real processing
        #years = ['1999', '2000', '2001', '2002']
        #months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
        try:
            if years is None or len(years) < 1:
                raise Exception('years should be defined')
            if months is None or len(months) < 1:
                raise Exception('months should be defined')
        except Exception as ex:
            print '[e] exception: {0}, arguments: {1}'.format(ex.message, ex.args)
            return
        result_csv = list()
        for year in years:
            for month in months:
                month_year = '{0}-{1}'.format(month, year)
                url_to_download = self.dynamic_url % (month_year)
                # printing info about the url to download
                print '[i] url to download {}'.format(url_to_download)
                month_weather = internal_parser(self.download_document(url_to_download))
                for key in month_weather:
                    tmp_time = '{}-{}-{}'.format(str(year), str(month), str(key))
                    tmp_max = str(month_weather[key][0])
                    tmp_min = str(month_weather[key][1])
                    result_csv.append([tmp_time, tmp_max, tmp_min])
                # setting the main script to sleep for some seconds
                sleep(SLEEP_TIME_IN_SECONDS)
        # generating a new file name
        _new_file_name = str(uuid.uuid1())
        # converting to CSV
        csv_output = ''
        # setting the header of the CSV
        prefix = tag[0:3]
        csv_output = prefix + 'Time' + delimeter + prefix + 'MaxTemp' + delimeter + prefix + 'MinTemp' + '\n'
        for row in result_csv:
            tmp = ''
            for value in row:
                tmp += value + delimeter
            csv_output += tmp[:-1 * len(delimeter)] + '\n'
        # saving the CSV output to a file
        csv_full_path = self.work_dir + os.path.sep + tag + '-' + _new_file_name + '.csv'
        self.helper.save_file(csv_full_path, csv_output)
        print '[i] data in csv saved to the {}'.format(csv_full_path)

    # dispatcher for extracting and parsing one single web page;
    # it actually shows statistics of the last three years starting from the given one
    dispatcher = {
        'http://diglib.bis.uni-oldenburg.de/hsb/statistik/?page=hsb_institut&jahr=2014&inst=20100': process_biblio_oldb,
    }

    # dispatcher for extracting and parsing multiple web pages
    dispatcher_dynamic = {
        'http://en.tutiempo.net/climate/%s/ws-284935.html': [process_tutiempo_weather, "KRASNOYARSK"],
        'http://en.tutiempo.net/climate/%s/ws-296340.html': [process_tutiempo_weather, "NOVOSIBIRSK"],
    }
class BSCrawler():

    UA = 'Mozilla/5.0 (X11; U; FreeBSD i386; en-US; rv:1.9.2.9) Gecko/20100913 Firefox/3.6.9'

    def __init__(self, start_url, domain):
        """
        Initial method that:
            - initiates the helper class;
            - checks that the temp directory exists.
        """
        self.helper = DirectoryHelper()
        #self.helper.prepare_working_directory()
        try:
            self.work_dir = self.helper.work_dir
        except:
            self.work_dir = '__temp__'
        if not os.path.exists(self.work_dir):
            os.makedirs(self.work_dir)
        if start_url is None:
            print '[e] specify start URL'
        start_url = BSCrawler.adjust_url(start_url)
        self.urls = SetQueue()
        self.urls.put(start_url)
        self.domain = domain
        self.website_graph = {}
        print '[i] files will be saved into folder "{0}"'.format(self.work_dir)

    def download_document(self, url):
        """
        (obj, str) -> (str)

        Downloading an HTML page and storing it in a string.
        """
        html = None
        try:
            req = urllib2.Request(url=url, headers={'User-Agent': self.UA})
            hdl = urllib2.urlopen(req)
            content_type = hdl.info().dict['content-type']
            if ACCEPTABLE_CONTENT_TYPE in content_type:
                html = hdl.read()
            else:
                print '[i] ignored content-type was {0}'.format(content_type)
        except Exception as ex:
            print '[e] exception: {0}, arguments: {1}'.format(ex.message, ex.args)
        return html

    @staticmethod
    def adjust_url(url):
        """Removing the trailing slash if present."""
        if url[-1:] == '/':
            url = url[:-1]
        return url

    def crawl(self):
        """
        (obj) -> None

        Taking URLs from the queue and processing them until the queue is empty.
        """
        visited = set()
        while not self.urls.empty():
            # iterating over the queue
            url = self.urls.get()
            print '[i] parsing following url {0}'.format(url)
            visited.add(url)
            html = self.download_document(url)
            if html is None:
                continue
            soup = BeautifulSoup(html)
            # extracting additional links
            for line in soup.findAll('a'):
                url_potential = line.get('href')
                if url_potential != '#' and url_potential is not None:
                    url_potential = BSCrawler.adjust_url(url_potential)
                    self.update_website_graph(from_link=url, to_link=url_potential)
                    if url_potential not in visited and self.domain in url_potential:
                        self.urls.put(url_potential)  # adding to the queue

            def proper_filename(url):
                html_file_name = url.replace('https://', '')
                html_file_name = html_file_name.replace('http://', '')
                html_file_name = html_file_name.replace(':', '')
                #html_file_name = html_file_name.replace('/', os.path.sep) + str(uuid.uuid1())
                html_file_name = html_file_name.replace('/', '-') + str(uuid.uuid1())
                return self.work_dir + os.path.sep + html_file_name + '.html'

            # saving the html to a file
            full_file_name = proper_filename(url)
            self.helper.save_file(full_file_name, html)
            print '[i] html was saved to the {}'.format(full_file_name)

    def update_website_graph(self, from_link, to_link):
        """
        (obj, str, str) -> None

        Updating the graph of a web site.
        """
        if from_link not in self.website_graph:
            self.website_graph[from_link] = []
        self.website_graph[from_link] += [to_link]
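# The crawler above depends on a SetQueue class that is not defined in this snippet.
# The following is only a sketch of the assumed behaviour: a Queue.Queue whose backing
# store is a set, so a URL already waiting in the queue is not enqueued twice
# (Python 2, hence the Queue module). The project's actual SetQueue may differ.
import Queue


class SetQueue(Queue.Queue):
    """Hypothetical stand-in: a queue that ignores duplicates of queued items."""

    def _init(self, maxsize):
        self.queue = set()

    def _put(self, item):
        self.queue.add(item)

    def _get(self):
        return self.queue.pop()


# illustrative usage of the crawler (start URL and domain are placeholders):
#   crawler = BSCrawler(start_url='http://example.com/', domain='example.com')
#   crawler.crawl()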
class WordCloudHelper():
    """Class that is responsible for data masks and fonts."""

    def __init__(self, work_dir=None):
        """
        (obj, str) -> None

        Initializing the class.
        """
        # assert work_dir is not None, \
        #     "work_dir should not be None"
        self.dir_helper = DirectoryHelper(work_dir)
        self.save_dir = self.dir_helper.work_dir
        self.dir_helper.prepare_working_directory()
        print '[i] working directory prepared'

    def load_fonts(self, selected_fonts=None):
        """
        (obj, list) -> dict

        Loading fonts as specified in the list or by iterating
        over the folder with fonts.
        """
        BASE_FOLDER = self.dir_helper.upper_directory() + 'fonts/'
        fonts = {}
        if selected_fonts is not None:
            for font in selected_fonts:
                fonts[font] = BASE_FOLDER + font + '.ttf'
        else:
            for f in os.listdir(BASE_FOLDER):
                if f[-4:].lower() == '.ttf':
                    fonts[f[:-4]] = BASE_FOLDER + f
        return fonts

    def load_masks(self, selected_masks=None):
        """
        (obj, list) -> dict

        Loading masks as specified in the list or by iterating
        over the folder with masks.
        """
        BASE_FOLDER = self.dir_helper.upper_directory() + 'masks/'
        masks = {}
        if selected_masks is not None:
            for mask in selected_masks:
                masks[mask] = BASE_FOLDER + mask + '.png'
        else:
            for f in os.listdir(BASE_FOLDER):
                if f[-4:].lower() == '.png':
                    masks[f[:-4]] = BASE_FOLDER + f
        return masks
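# A brief usage sketch tying WordCloudHelper to the wordcloud package it is named
# after. The font name, mask name, and input text are placeholders, and the
# PIL/numpy/wordcloud imports are assumptions about the surrounding project.
import numpy as np
from PIL import Image
from wordcloud import WordCloud

if __name__ == '__main__':
    wc_helper = WordCloudHelper('__temp__')
    fonts = wc_helper.load_fonts(['DejaVuSans'])   # expects ../fonts/DejaVuSans.ttf
    masks = wc_helper.load_masks(['cloud'])        # expects ../masks/cloud.png
    wordcloud = WordCloud(font_path=fonts['DejaVuSans'],
                          mask=np.array(Image.open(masks['cloud'])),
                          background_color='white')
    wordcloud.generate('sample text for the word cloud sample text')
    wordcloud.to_file(wc_helper.save_dir + os.path.sep + 'wordcloud.png')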