Exemplo n.º 1
0
    def _write_location_tocsv(self,
                              datawriter: TextIO,
                              cleaned_post_location: CleanedPost,
                              panon_set=None) -> None:
        """Append one cleaned post location record to an open CSV writer.

        Persists intermediate cleaned post data for later processing.

        Arguments
        datawriter              -      open file handle of the output
                                       CSV file
        cleaned_post_location   -      cleaned post of type CleanedPost
                                       (namedtuple)
        panon_set               -      optional set of popular terms; when
                                       given, item lists of the post
                                       (emoji, tags, body content) are
                                       reduced to terms present in this
                                       set, yielding a pseudo-anonymized
                                       record that keeps only the widely
                                       used, less identifiable terms.
        """
        record = cleaned_post_location
        if panon_set:
            # Reduce term lists to the popular-terms set before writing.
            record = self._panonymize_cleaned_post(record, panon_set)
        datawriter.writerow(PrepareData._cleaned_ploc_tolist(record))
Exemplo n.º 2
0
def extract_info(link: str, csv_writer: TextIO) -> None:
    """Scrape a single job-posting page and append one summary row to CSV.

    Arguments
    link        -   URL of the job-posting page to fetch
    csv_writer  -   open ``csv.writer``-like object; receives one row of
                    (date_start, title, keywords, len(keywords), link,
                    all_text)

    Each scraping step is best-effort: failures are printed and the row is
    still written with default placeholder values.
    """
    html = urlopen(link)
    bs_obj_jobs = BeautifulSoup(html, 'lxml')
    table = bs_obj_jobs.find_all('table')
    # Bug fix: initialize every field up front. Previously date_start,
    # title and all_text were left unbound when their try-block failed,
    # so the final print/writerow raised NameError instead of writing a
    # best-effort row.
    description = 'None'
    date_start = 'None'
    title = 'None'
    all_text = 'false'
    try:
        # Hoist the row lookup: the original re-parsed find_all('tr')
        # for every access.
        rows = table[2].table.find_all('tr')
        date_cell = rows[1].td.td.text
        if len(date_cell) == 10:
            # if date only: '03.08.2020'
            date_start = date_cell
        else:
            # if this format: '03.08.2020, Ref.#: DE' get only date
            date_start = date_cell.split(',')[0]
    except Exception as ex:
        print(f'Ellement not found {ex}')

    try:  # header info # address, working hours etc.
        header_info = table[2].table.find_all('tr')[1].find(
            'td', style="font-style:italic;").get_text()  # also contain GPS
        print(header_info)
    except Exception as ex:
        print(f'Ellement not found {ex}')

    try:
        rows = table[2].table.find_all('tr')
        title = rows[3].b.text
        company_url = rows[3].a['href']
        print(f'Title: {title} and company url, {company_url}')
    except Exception as ex:
        print(f'Ellement not found {ex}')

    try:
        rows = table[2].table.find_all('tr')
        full_text_len = len(rows[7].get_text())
        if full_text_len < 700:
            description = rows[6].get_text()
            all_text = 'true'
            if len(description) < 700:
                # NOTE(review): this assigns the whole <table> tag object,
                # not text — looks suspicious but preserved as-is; confirm
                # intended behavior before changing.
                description = table[2].table
                all_text = 'true'
        elif full_text_len > 700:
            description = rows[7].get_text()
            all_text = 'true'
        else:
            # only reached when full_text_len == 700 exactly
            description = 'None'
            all_text = 'false'
    except Exception as ex:
        print(ex)

    try:
        keywords = keywords_f(description)
    except Exception as ex:
        keywords: str = 'None'
        # Bug fix: print the caught exception, not the Exception class.
        print(ex)
        # 'date_start', 'str_keywords', 'len_keywords', 'title', 'link'
    print(date_start, title, keywords, len(keywords), link,
          all_text)  # no list of keywords
    data = date_start, title, keywords, len(keywords), link, all_text
    # Writing data in file
    csv_writer.writerow(data)