def guess_sport(data, full_text):
    # First pass: count sport keywords in the infobox data
    data_lc = clean_text(str(data)).lower()
    print(data_lc)
    kickboxing_refs1 = len(re.findall(r'kickbox', data_lc))
    boxing_refs1 = len(re.findall(r'[\W.,]+box', data_lc))
    mma_refs1 = len(re.findall(r'(mixed martial arts)|(mma)', data_lc))
    print(kickboxing_refs1, boxing_refs1, mma_refs1)
    if kickboxing_refs1 > boxing_refs1 and kickboxing_refs1 > mma_refs1:
        return kickboxing_key
    if mma_refs1 > boxing_refs1 and mma_refs1 > kickboxing_refs1:
        return mma_key
    if boxing_refs1 > mma_refs1 and boxing_refs1 > kickboxing_refs1:
        return boxing_key

    # Second pass: fall back to counting keywords in the full page text
    data_lc = clean_text(str(full_text)).lower()
    kickboxing_refs2 = len(re.findall(r'kickbox', data_lc))
    boxing_refs2 = len(re.findall(r'[\W.,]+box', data_lc))
    mma_refs2 = len(re.findall(r'(mixed martial arts)|(mma)', data_lc))
    print(kickboxing_refs2, boxing_refs2, mma_refs2)
    if kickboxing_refs2 > boxing_refs2 and kickboxing_refs2 > mma_refs2:
        return kickboxing_key
    if mma_refs2 > boxing_refs2 and mma_refs2 > kickboxing_refs2:
        return mma_key
    if boxing_refs2 > mma_refs2 and boxing_refs2 > kickboxing_refs2:
        return boxing_key
    return other_key
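# A minimal usage sketch for guess_sport, not part of the original module: the
# function is normally fed the Wikipedia infobox element and the raw page HTML,
# but any pair of strings works since both arguments are passed through str().
# The _demo_* name is hypothetical and the result depends on clean_text and the
# module-level *_key constants defined elsewhere in this file.
def _demo_guess_sport():
    infobox_text = "Professional boxing record: 20 wins, 2 losses"
    page_html = "<p>He retired from professional boxing in 2010.</p>"
    # Only boxing keywords appear, so boxing_key is the expected return value.
    print(guess_sport(infobox_text, page_html))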
def main():
    check_or_create_save_folder(SAVE_FILE_PATH)

    # List all scraped files related to ApInfo
    for html_file_path in os.listdir(JOB_FOLDER):
        # Parse only HTML files
        if html_file_path.endswith(".html"):
            job_id = re.findall(r'\d+', html_file_path)[0]
            json_file_name = "%s-%s.json" % (job_platform, job_id)
            save_path = "%s/%s" % (SAVE_FILE_PATH, json_file_name)

            # Check if file hasn't already been parsed
            if not os.path.isfile(save_path):
                try:
                    with open(JOB_FOLDER + "/" + html_file_path) as htmlfile:
                        soup = BeautifulSoup(htmlfile.read())

                    job_info = soup.find("div", class_="info-data").text.strip()
                    date = job_info.rsplit('-', 1)[1].strip()
                    location = clean_text(job_info.rsplit('-', 1)[0])
                    city = location.rsplit('-', 1)[0].strip()
                    state = location.rsplit('-', 1)[1].strip()

                    job_title = soup.find("div", class_="cargo m-tb").text
                    job_title = clean_apinfo_jobtitle(job_title)

                    company = clean_text(
                        soup.find('div', class_="texto").contents[3].contents[2])
                    job_description = clean_text(
                        soup.find('div', class_="texto").contents[1].text)

                    data = {
                        'date': date,
                        'job_title': job_title,
                        'company': company,
                        'location_city': city,
                        'location_state': state,
                        'job_description': job_description,
                        'job_platform': job_platform,
                        'job_platform_id': job_id
                    }
                    save_json_file(save_path, data)

                # Log errors to a text file
                except Exception as e:
                    error_details = ""
                    if job_id:
                        error_details += "%s: " % job_id
                    print(job_id)
                    print(e)
                    error_details += str(e)
                    with open(ERROR_LOG_FILE, "a") as target:
                        target.write("%s\n" % error_details)
def extract_general_method(s):
    # Bucket a fight-result method cell (e.g. "TKO (punches)") into a coarse category
    split_method = str(s).split('(')
    if len(split_method) >= 2:
        clean_method = clean_text(split_method[0])
    else:
        clean_method = clean_text(s)
    if 'tko' in clean_method or 'ko' in clean_method:
        return 'ko'
    elif 'submission' in clean_method:
        return 'submission'
    elif 'decision' in clean_method:
        return 'decision'
    else:
        return 'other'
def main():
    check_or_create_save_folder(SAVE_FILE_PATH)

    # List all scraped files related to Trampos
    for json_file_path in os.listdir(JOB_FOLDER):
        # Parse only JSON files
        if json_file_path.endswith(".json"):
            job_id = re.findall(r'\d+', json_file_path)[0]
            json_file_name = "%s-%s.json" % (job_platform, job_id)
            save_path = "%s/%s" % (SAVE_FILE_PATH, json_file_name)

            # Check if file hasn't already been parsed
            if not os.path.isfile(save_path):
                try:
                    with open("%s/%s" % (JOB_FOLDER, json_file_path)) as json_data:
                        job_data = json.load(json_data)['opportunity']

                    data = {}
                    data['date'] = get_date(job_data['published_at'])
                    if 'city' in job_data and 'state' in job_data:
                        data['city'] = clean_text(job_data['city'])
                        data['state'] = clean_text(job_data['state'])
                    elif 'home_office' in job_data:
                        data['home_office'] = True
                    data['job_title'] = clean_text(job_data['name'])
                    if 'company' in job_data and job_data['company']:
                        data['company'] = clean_text(job_data['company']['name'])
                    data['job_description'] = "%s %s %s" % (
                        clean_text(job_data['description']),
                        clean_text(job_data['prerequisite']),
                        clean_text(job_data['desirable']))
                    data['job_platform'] = job_platform
                    data['job_platform_id'] = job_id
                    save_json_file(save_path, data)

                # Log errors to a text file
                except Exception as e:
                    error_details = ""
                    if job_id:
                        error_details += "%s: " % job_id
                    print(job_id)
                    print(e)
                    error_details += str(e)
                    with open(ERROR_LOG_FILE, "a") as target:
                        target.write("%s\n" % error_details)
def transform(self, documents):
    documents = [tokenize(d) for d in documents]
    documents = [d[:self.max_page_size] for d in documents]
    documents = [' '.join(d) for d in documents]

    if self.encoding_type in ['tfidf', 'count', 'binary']:
        return self.vectorizer.transform(documents).toarray()

    if self.encoding_type == 'lda':
        documents_tokenized = [tokenize(i) for i in documents]
        other_corpus = [
            self.common_dictionary.doc2bow(i) for i in documents_tokenized
        ]
        results = []
        for i in other_corpus:
            result = self.vectorizer[i]
            result = vectorize_topic_models(result, self.num_of_topics)
            results.append(result)
        return np.array(results)

    if self.encoding_type in ['doc2vec']:
        documents_tokenized = [tokenize(i) for i in documents]
        results = []
        for i in documents_tokenized:
            if i:
                try:
                    results.append(self.vectorizer[i][0])
                except KeyError:
                    results.append([0 for _ in range(self.encoding_size)])
            else:
                results.append([0 for _ in range(self.encoding_size)])
        return np.array(results)

    if self.encoding_type in ['fasttext']:
        documents_clean = [clean_text(i) for i in documents]
        results = []
        for i in documents_clean:
            if i:
                results.append(self.vectorizer.get_sentence_vector(i))
                # results.append(self.vectorizer[i])
            else:
                results.append(
                    np.array([0 for _ in range(self.encoding_size)]))
        return np.array(results)
def scrape_fighter(next_url):
    sections_dict = dict()
    r = requests.get(next_url)
    soup = BeautifulSoup(r.text)
    stats_table_card = soup.find('table', {'class': 'infobox vcard'})
    general_stats = get_general_info(stats_table_card)
    mw_parser_output = soup.find('div', {'class': 'mw-parser-output'})
    if mw_parser_output:
        page_items = mw_parser_output.find_all(['h2', 'h3', 'table'])

        # Group page elements under the h2 heading they appear beneath
        sections = dict()
        active_key = None
        for i in page_items:
            if i.name == 'h2':
                active_key = clean_text(i.get_text()).lower()
                sections[active_key] = []
            if active_key:
                sections[active_key].append(i)

        # Warn about record-like section headings that are not in the known lists
        for i in sections:
            if 'mma' in i.lower() or 'boxing' in i.lower() or 'record' in i.lower():
                if (i not in mma_page_section_names
                        and i not in boxing_page_section_names
                        and i not in [
                            'Possibly missed key: Amateur kickboxing career[edit]',
                            'Possibly missed key: Professional boxing career[edit]',
                            'Possibly missed key: Mixed martial arts career[edit]',
                            'Possibly missed key: Kickboxing record (Incomplete)[edit]']):
                    print('Possibly missed key: {0}'.format(i))

        active_key2 = None
        for c1, i in enumerate(sections):
            if i in mma_page_section_names:
                sections_dict.setdefault(mma_key, dict())
                sections_dict[mma_key].setdefault(exhibition_key, list())
                sections_dict[mma_key].setdefault(pro_key, list())
                sections_dict[mma_key].setdefault(amateur_key, list())
                active_key1 = mma_key
            elif i in boxing_page_section_names:
                sections_dict.setdefault(boxing_key, dict())
                sections_dict[boxing_key].setdefault(exhibition_key, list())
                sections_dict[boxing_key].setdefault(pro_key, list())
                sections_dict[boxing_key].setdefault(amateur_key, list())
                active_key1 = boxing_key
            # elif i in kickboxing_page_section_names:
            #     sections_dict.setdefault(kickboxing_key, dict())
            #     sections_dict[kickboxing_key].setdefault(exhibition_key, list())
            #     sections_dict[kickboxing_key].setdefault(pro_key, list())
            #     sections_dict[kickboxing_key].setdefault(amateur_key, list())
            #     active_key1 = kickboxing_key
            elif i in mma_amateur_section:
                sections_dict.setdefault(mma_key, dict())
                sections_dict[mma_key].setdefault(amateur_key, list())
                active_key1 = mma_key
                active_key2 = amateur_key
            elif i in mma_pro_section:
                sections_dict.setdefault(mma_key, dict())
                sections_dict[mma_key].setdefault(pro_key, list())
                active_key1 = mma_key
                active_key2 = pro_key
            elif i in professional_boxing_section_names:
                sections_dict.setdefault(boxing_key, dict())
                sections_dict[boxing_key].setdefault(pro_key, list())
                active_key1 = boxing_key
                active_key2 = pro_key
            elif i in professional_record_names:
                sport_type = guess_sport(stats_table_card, r.text)
                print('guessing sport: {0} {1}'.format(next_url, sport_type))
                active_key1 = sport_type
                sections_dict.setdefault(sport_type, dict())
                sections_dict[sport_type].setdefault(amateur_key, list())
                sections_dict[sport_type].setdefault(pro_key, list())
            else:
                continue

            # Assign each data table in this section to a pro/amateur/exhibition bucket
            for c2, j in enumerate(sections[i]):
                data_type = get_table_type(j)
                # print(data_type)
                if data_type == other_key:
                    continue
                if data_type == data_table_key and c2 < 2:
                    active_key2 = pro_key
                if data_type in [pro_key, amateur_key, exhibition_key]:
                    active_key2 = data_type
                if data_type == data_table_key and active_key2:
                    sections_dict[active_key1][active_key2].append(j)
                    active_key2 = None
    return sections_dict, general_stats
def extract_table(f_url, tables_dict, general_stats):
    dfs = []
    new_urls = []
    fighter_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, f_url))

    sport_keys = tables_dict.keys()
    for s in sport_keys:
        types_of_events = tables_dict[s].keys()
        for t in types_of_events:
            print(s, t)
            for j in tables_dict[s][t]:
                print(1)
                tr_tags = j.find_all('tr')
                header_index = get_row_num_of_headers(j)
                opponent_col_name = [
                    i.get_text().strip()
                    for i in tr_tags[header_index].find_all(['th', 'td'])
                    if 'opponent' in i.get_text().strip().lower()][0]
                index_of_opponent = [
                    c for c, i in enumerate(tr_tags[header_index].find_all(['th', 'td']))
                    if opponent_col_name.strip() == i.get_text().strip()][0]

                # Map each opponent name to a stable id and whether they have a wiki page
                id_mapping = dict()
                for k in tr_tags[header_index + 1:]:
                    opponent_cell = k.find_all('td')[index_of_opponent]
                    opponent_a_tag = opponent_cell.find_all('a')
                    opponent_name = opponent_cell.get_text().strip()
                    opponent_rel_links = [
                        k2['href'] for k2 in opponent_a_tag
                        if k2['href'] not in links_to_avoid]
                    if opponent_rel_links:
                        opponent_rel_link = opponent_rel_links[-1]
                        opponent_abs_link = urljoin(base_url, opponent_rel_link)
                        new_urls.append(opponent_abs_link)
                        opponent_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, opponent_abs_link))
                        has_wiki = 1
                    else:
                        opponent_id = str(uuid.uuid4())
                        has_wiki = 0
                    id_mapping[opponent_name] = {'id': opponent_id, 'has_wiki': has_wiki}

                df = pd.read_html(str(j), header=header_index)[0]
                df['fighter_id'] = fighter_id
                df['opponent_id'] = df.apply(
                    lambda x: id_mapping.get(x[opponent_col_name], {'id': str(uuid.uuid4())})['id'],
                    axis=1)
                df['opponent_has_wiki'] = df.apply(
                    lambda x: id_mapping.get(x[opponent_col_name], {'has_wiki': 0})['has_wiki'],
                    axis=1)
                df['sport'] = s
                df['event_type'] = t
                for g in general_stats:
                    df[g] = general_stats[g]
                df = df.applymap(lambda x: clean_text(x))
                dfs.append(df)

    if dfs:
        df = pd.concat(dfs)
    else:
        df = pd.DataFrame()
    return df, new_urls
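# A rough crawl-loop sketch (not part of the original pipeline) showing how
# scrape_fighter and extract_table are meant to be chained: scrape one fighter
# page, flatten its record tables, and queue opponents that have Wikipedia pages.
# The function name, page limit, and CSV output path are illustrative assumptions.
def _demo_crawl(seed_url, max_pages=25):
    to_visit = [seed_url]
    visited = set()
    frames = []
    while to_visit and len(visited) < max_pages:
        url = to_visit.pop(0)
        if url in visited:
            continue
        visited.add(url)
        sections_dict, general_stats = scrape_fighter(url)
        record_df, new_urls = extract_table(url, sections_dict, general_stats)
        if not record_df.empty:
            frames.append(record_df)
        to_visit.extend(u for u in new_urls if u not in visited)
    if frames:
        pd.concat(frames).to_csv('fight_records_sample.csv', index=False)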
def main():
    check_or_create_save_folder(SAVE_FILE_PATH)

    # List all scraped files related to Ceviu
    for html_file_path in os.listdir(JOB_FOLDER):
        # Parse only HTML files
        if html_file_path.endswith(".html"):
            job_id = re.findall(r'\d+', html_file_path)[0]
            json_file_name = "%s-%s.json" % (job_platform, job_id)
            save_path = "%s/%s" % (SAVE_FILE_PATH, json_file_name)

            # Check if file hasn't already been parsed
            if not os.path.isfile(save_path):
                try:
                    with open(JOB_FOLDER + "/" + html_file_path) as htmlfile:
                        soup = BeautifulSoup(htmlfile.read())

                    job_info = soup.find("p", class_="codigo-data-vaga").text.strip()
                    date = job_info.rsplit('Data: ', 1)[1].strip()

                    location = soup.find('div', class_="localizacao-vaga").text
                    location = clean_text(location)
                    city = re.search('Localizacao (.*)/', location, re.IGNORECASE).group(1)
                    state = re.search('/(.*)', location, re.IGNORECASE).group(1)

                    job_title = soup.find("h2", class_="titulo-vaga").text
                    job_title = clean_text(job_title)

                    company = None
                    if soup.find('a', class_="nome-empresa"):
                        company = soup.find('a', class_="nome-empresa").text
                        company = clean_text(company)

                    job_description = soup.find('div', class_='descricao-vaga').text
                    job_description = clean_text(job_description)
                    job_description = re.sub("Descricao da vaga ", "", job_description)
                    job_description = re.sub("Vaga Patrocinada ", "", job_description)

                    data = {
                        'date': date,
                        'job_title': job_title,
                        'location_city': city,
                        'location_state': state,
                        'job_description': job_description,
                        'job_platform': job_platform,
                        'job_platform_id': job_id
                    }
                    if company:
                        data['company'] = company
                    save_json_file(save_path, data)

                # Log errors to a text file
                except Exception as e:
                    error_details = ""
                    if job_id:
                        error_details += "%s: " % job_id
                    print(job_id)
                    print(e)
                    error_details += str(e)
                    with open(ERROR_LOG_FILE, "a") as target:
                        target.write("%s\n" % error_details)
def clean_name(s):
    # Drop a quoted nickname, e.g. 'First "Nickname" Last' -> 'First Last'
    s_split = str(s).split('"')
    if len(s_split) >= 3:
        return clean_text(s_split[0] + s_split[-1])
    return clean_text(s)
def extract_details(s):
    # Return the text after the first '(' in a method cell, e.g. the "(punches)" part
    split_method = str(s).split('(')
    if len(split_method) >= 2:
        return clean_text(' '.join(split_method[1:]))
def extract_method(s):
    # Return the text before the first '(' in a method cell, e.g. "TKO"
    split_method = str(s).split('(')
    if len(split_method) >= 2:
        return clean_text(split_method[0])
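# A small illustrative sketch (not in the original code) of how the cell-parsing
# helpers above are used on Wikipedia record-table values. Exact return strings
# depend on clean_text, defined elsewhere in this module, so the comments describe
# the intent rather than guaranteed output.
def _demo_cell_parsers():
    print(clean_name('Quinton "Rampage" Jackson'))        # quoted nickname dropped
    print(extract_method('TKO (punches)'))                # text before the parenthesis
    print(extract_details('TKO (punches)'))               # text after the parenthesis
    print(extract_general_method('Submission (armbar)'))  # coarse bucket, e.g. 'submission'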
def main():
    check_or_create_save_folder(SAVE_FILE_PATH)

    # List all scraped files related to NetCarreiras
    for html_file_path in os.listdir(JOB_FOLDER):
        # Parse only HTML files
        if html_file_path.endswith(".html"):
            job_id = re.findall(r'\d+', html_file_path)[0]
            json_file_name = "%s-%s.json" % (job_platform, job_id)
            save_path = "%s/%s" % (SAVE_FILE_PATH, json_file_name)

            # Check if file hasn't already been parsed
            if not os.path.isfile(save_path):
                try:
                    with open("%s/%s" % (JOB_FOLDER, html_file_path)) as htmlfile:
                        soup = BeautifulSoup(htmlfile.read())

                    date = soup.find('div', class_="profile").contents[3].text
                    date = clean_text(date)

                    location = soup.find('div', {'id': "location"}).text
                    location = clean_text(location)
                    city = re.search('(.*) -', location, re.IGNORECASE).group(1)
                    state = re.search(r'- (.*) \(', location, re.IGNORECASE).group(1)

                    job_title = soup.find("h1").text
                    job_title = clean_text(job_title)

                    company = None
                    if soup.find_all('a', href=re.compile('^vagas-na-(.*)')):
                        company = soup.find_all('a', href=re.compile('^vagas-na-(.*)'))[0].text
                        company = clean_text(company)

                    job_description = soup.find('article').contents[11].text
                    job_description = clean_text(job_description)

                    data = {
                        'date': date,
                        'job_title': job_title,
                        'location_city': city,
                        'location_state': state,
                        'job_description': job_description,
                        'job_platform': job_platform,
                        'job_platform_id': job_id
                    }
                    if company:
                        data['company'] = company
                    save_json_file(save_path, data)

                # Log errors to a text file
                except Exception as e:
                    error_details = ""
                    if job_id:
                        error_details += "%s: " % job_id
                    print(job_id)
                    print(e)
                    error_details += str(e)
                    with open(ERROR_LOG_FILE, "a") as target:
                        target.write("%s\n" % error_details)
def fit(self, documents):
    documents = [tokenize(d) for d in documents]
    documents = [d[:self.max_page_size] for d in documents]
    documents = [' '.join(d) for d in documents]

    if self.encoding_type in ['tfidf', 'count', 'binary']:
        if self.encoding_type == 'tfidf':
            # tf-idf weighted bag of n-grams; TfidfVectorizer comes from
            # sklearn.feature_extraction.text, same module as CountVectorizer
            self.vectorizer = TfidfVectorizer(
                ngram_range=(self.min_n_gram, self.max_n_gram),
                max_features=self.max_vocab_size,
                max_df=self.max_df,
                analyzer=self.tokenizer_level)
        if self.encoding_type == 'count':
            # raw n-gram counts
            self.vectorizer = CountVectorizer(
                ngram_range=(self.min_n_gram, self.max_n_gram),
                max_features=self.max_vocab_size,
                binary=False,
                max_df=self.max_df,
                analyzer=self.tokenizer_level)
        if self.encoding_type == 'binary':
            # presence/absence of each n-gram
            self.vectorizer = CountVectorizer(
                ngram_range=(self.min_n_gram, self.max_n_gram),
                max_features=self.max_vocab_size,
                binary=True,
                max_df=self.max_df,
                analyzer=self.tokenizer_level)
        self.vectorizer.fit(documents)
        with open(self.save_file_loc, 'wb') as f:
            pickle.dump(self.vectorizer, f)

    if self.encoding_type == 'lda':
        documents_tokenized = [tokenize(i) for i in documents]
        self.common_dictionary = Dictionary(documents_tokenized)
        common_corpus = [
            self.common_dictionary.doc2bow(text)
            for text in documents_tokenized
        ]
        self.vectorizer = ldamodel.LdaModel(common_corpus,
                                            id2word=self.common_dictionary,
                                            num_topics=self.num_of_topics,
                                            passes=self.vectorizer_epochs)
        self.vectorizer.save(self.save_file_loc)

    if self.encoding_type == 'doc2vec':
        tagged_documents = [
            TaggedDocument(tokenize(doc), [i])
            for i, doc in enumerate(documents)
        ]
        self.vectorizer = Doc2Vec(tagged_documents,
                                  vector_size=self.encoding_size,
                                  window=2,
                                  min_count=1,
                                  workers=4,
                                  epochs=self.vectorizer_epochs,
                                  max_vocab_size=100000)
        self.vectorizer.delete_temporary_training_data(
            keep_doctags_vectors=True, keep_inference=True)
        self.vectorizer.save(self.save_file_loc)

    if self.encoding_type == 'fasttext':
        with open(self.fasttext_training_file_location, 'w') as f:
            for i in documents:
                f.write(clean_text(i) + '\n')
        self.vectorizer = fasttext.train_unsupervised(
            self.fasttext_training_file_location,
            model=self.fasttext_algorithm,
            dim=self.encoding_size)
        self.vectorizer.save_model(self.save_file_loc)
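# A minimal usage sketch for the fit/transform pair above, assuming `encoder` is an
# already-constructed instance of the enclosing vectorizer class configured with
# encoding_type='tfidf' (the class name and constructor arguments live elsewhere in
# the project, so none are spelled out here).
def _demo_encoder(encoder):
    train_docs = ["data engineer python sql",
                  "frontend developer javascript react"]
    encoder.fit(train_docs)                         # fits and pickles the vectorizer
    matrix = encoder.transform(["python data pipelines"])
    print(matrix.shape)                             # (1, vocabulary size)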