def _write_location_tocsv(self, datawriter: TextIO,
                          cleaned_post_location: CleanedPost,
                          panon_set=None) -> None:
    """Append one cleaned post record to the open CSV writer.

    Intermediate cleaned post data is written to file for later use.

    Args:
        datawriter: Open csv writer wrapping the output file handle.
        cleaned_post_location: Cleaned post of type CleanedPost (namedtuple).
        panon_set: Optional collection of popular terms. When given, the
            post's item lists (emoji, tags, body content) are limited to
            the terms present in this set, producing a pseudo-anonymized
            record that only contains the less identifiable popular terms
            used by many users.
    """
    post = cleaned_post_location
    if panon_set:
        post = self._panonymize_cleaned_post(post, panon_set)
    datawriter.writerow(PrepareData._cleaned_ploc_tolist(post))
def extract_info(link: str, csv_writer: TextIO) -> None:
    """Scrape one job-posting page and append a summary row to the CSV.

    Extracted fields: start date, title, keywords, keyword count, the
    source link and an ``all_text`` flag. A parsing failure for any
    single field is logged and that field falls back to a default, so
    one broken page element does not abort the whole row.

    Args:
        link: URL of the job-posting page.
        csv_writer: Open csv writer to append the result row to.
    """
    html = urlopen(link)
    bs_obj_jobs = BeautifulSoup(html, 'lxml')
    table = bs_obj_jobs.find_all('table')

    # Defaults so the final print/writerow never hits an unbound name
    # when one of the try-blocks below fails (previously a NameError).
    date_start = 'None'
    title = 'None'
    description = 'None'
    all_text = 'false'

    # The page nests everything inside the third <table>; hoist the row
    # lookup once instead of re-parsing it for every field below.
    try:
        rows = table[2].table.find_all('tr')
    except Exception as ex:
        print(f'Ellement not found {ex}')
        rows = []

    try:
        raw_date = rows[1].td.td.text
        if len(raw_date) == 10:
            # date only: '03.08.2020'
            date_start = raw_date
        else:
            # format '03.08.2020, Ref.#: DE' -> keep only the date part
            date_start = raw_date.split(',')[0]
    except Exception as ex:
        print(f'Ellement not found {ex}')

    try:
        # header info: address, working hours etc. (also contains GPS)
        header_info = rows[1].find(
            'td', style="font-style:italic;").get_text()
        print(header_info)
    except Exception as ex:
        print(f'Ellement not found {ex}')

    try:
        title = rows[3].b.text
        company_url = rows[3].a['href']
        print(f'Title: {title} and company url, {company_url}')
    except Exception as ex:
        print(f'Ellement not found {ex}')

    try:
        full_text_len = len(rows[7].get_text())
        if full_text_len < 700:
            description = rows[6].get_text()
            all_text = 'true'
            if len(description) < 700:
                # Short fallback: keep the whole table element instead.
                # NOTE(review): this stores a bs4 Tag, not text — looks
                # intentional upstream (keywords_f receives it); confirm.
                description = table[2].table
                all_text = 'true'
        elif full_text_len > 700:
            description = rows[7].get_text()
            all_text = 'true'
        else:
            # exactly 700 characters: treat as no usable description
            description = 'None'
            all_text = 'false'
    except Exception as ex:
        print(ex)

    try:
        keywords = keywords_f(description)
    except Exception as ex:
        keywords = 'None'
        # was print(Exception) — printed the class object, not the error
        print(ex)

    # 'date_start', 'str_keywords', 'len_keywords', 'title', 'link'
    print(date_start, title, keywords, len(keywords), link, all_text)
    # no list of keywords
    data = date_start, title, keywords, len(keywords), link, all_text
    # Writing data in file
    csv_writer.writerow(data)
    return None