def get_processed_questions_and_answers_as_lists(self, ids):
    # Stream every question title, question body and answer body in the
    # selected ids as a lower-cased, HTML-stripped list of tokens.
    for X, y in self._get_data_labels(ids):
        for _, row in X.iterrows():
            yield lower_text(process_html(row['question_title'])).split()
            yield lower_text(process_html(row['question_body'])).split()
            yield lower_text(process_html(row['body'])).split()

def transform_input(self, data):
    # Preprocess and vectorise question bodies and answer bodies
    # separately, each to its configured word count.
    question_body = self.transform_sentence_batch_to_vector(
        [lower_text(process_html(X)) for X in data['question_body']],
        self.question_body_words_count)
    answer_body = self.transform_sentence_batch_to_vector(
        [lower_text(process_html(X)) for X in data['body']],
        self.answer_body_words_count)

    return [question_body, answer_body]
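All of these examples lean on two project-local helpers, process_html and lower_text, which are not shown on this page. A plausible minimal sketch, assuming process_html strips markup down to visible text and lower_text lowercases and keeps only letters (both assumptions):

import re
from bs4 import BeautifulSoup

def process_html(text):
    # Assumed behaviour: drop HTML tags, keep only the visible text.
    return BeautifulSoup(text, 'html.parser').get_text(separator=' ')

def lower_text(text):
    # Assumed behaviour: lowercase and collapse non-letter runs to spaces.
    return re.sub(r'[^a-z]+', ' ', text.lower()).strip()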
Example #3
def get_processed_texts_as_lists(self, ids):
    # The reader yields paired batches (X0, y0, X1, y1); emit every answer
    # body from both halves as a token list. (The original accumulator was
    # always empty again by the next batch, so it is dropped here.)
    for X0_i, y0_i, X1_i, y1_i in self._get_data_labels(ids):
        X = pd.concat([X0_i, X1_i])
        for _, row in X.iterrows():
            yield lower_text(process_html(row['body'])).split()
Example #4
def __iter__(self):
    # Stream the Wikipedia dump sentence by sentence, so the consumer can
    # make repeated passes without loading the corpus into memory.
    data = api.load(self.filename)
    for article in data:
        for text in article['section_texts']:
            for sentence in text.split('.'):
                sentence = lower_text(sentence)
                if len(sentence) >= WikiSentences.MIN_SENTENCE_LENGTH:
                    yield sentence.split()
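This __iter__ is gensim's streaming-corpus idiom: Word2Vec re-iterates the object once per training pass, so nothing is materialised. A minimal usage sketch (the constructor argument and hyperparameters are assumptions; vector_size is the gensim 4 name, older versions call it size):

from gensim.models import Word2Vec

sentences = WikiSentences('wiki-english-20171001')  # filename assumed
model = Word2Vec(sentences=sentences, vector_size=100, min_count=5, workers=4)
model.save('wiki_word2vec.model')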
Example #5
import numpy as np

def test_lengths(csv_path):
    data_reader = ImbalancedDataReader(csv_path, 'question_id')
    question_title_lengths = []
    question_body_lengths = []
    answer_body_lengths = []
    cnt = 0
    for X, y in data_reader.get_raw_data_labels_batch(data_reader.get_ids(),
                                                      50):
        cnt += 1
        if cnt % 10 == 0:
            print(cnt)  # progress: one line every 10 batches
        # Iterate over the actual batch size; the last batch may hold
        # fewer than 50 rows.
        for i in range(len(X)):
            question_title_lengths.append(
                len(lower_text(process_html(X.iloc[i]['question_title']))))
            question_body_lengths.append(
                len(lower_text(process_html(X.iloc[i]['question_body']))))
            answer_body_lengths.append(
                len(lower_text(process_html(X.iloc[i]['body']))))
    draw_lens_histogram(question_body_lengths, 'question', 'question_lengths')
    draw_lens_histogram(answer_body_lengths, 'answer', 'answer_lengths')
    print(np.mean(question_body_lengths))
    print(np.mean(answer_body_lengths))
    print(np.median(question_body_lengths))
    print(np.median(answer_body_lengths))
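draw_lens_histogram is project-local and not shown on this page. A minimal matplotlib sketch matching the (lengths, label, file_name) call above; the bin count and the PNG output are assumptions:

import matplotlib.pyplot as plt

def draw_lens_histogram(lengths, label, file_name):
    plt.figure()
    plt.hist(lengths, bins=50)
    plt.xlabel('{} length, characters'.format(label))
    plt.ylabel('count')
    plt.savefig('{}.png'.format(file_name))
    plt.close()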
Example #6
import gensim.downloader as api

from utils.word_utils import lower_text

print('loading wiki-english-20171001 (downloads on first use)')
data = api.load('wiki-english-20171001')
print('corpus ready')
article_count = 0
sentence_count = 0
word_count = 0
for article in data:
    article_count += 1
    if article_count % 100 == 0:
        print(article_count)
    for text in article['section_texts']:
        sentences = text.split('.')
        for sentence in sentences:
            sentence = lower_text(sentence)
            if len(sentence) >= 5:  # skip near-empty fragments from the split
                sentence_count += 1
                word_count += len(sentence.split())

print('articles:', article_count)
print('sentences:', sentence_count)
print('words:', word_count)
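For orientation, each record this corpus yields is a dict; the loop above relies only on its 'section_texts' key. A quick inspection sketch (any other keys printed are whatever gensim-data ships, not guaranteed here):

first_article = next(iter(data))
print(sorted(first_article.keys()))  # 'section_texts' is the key used above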
def transform_input(self, data):
    # Answer-only variant: vectorise the answer body and pair it with the
    # extra features from get_other_features.
    answer_body = self.transform_sentence_batch_to_vector(
        [lower_text(process_html(X)) for X in data['body']],
        self.answer_body_words_count)
    other_features = self.get_other_features(data)
    return [answer_body, other_features]
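transform_sentence_batch_to_vector itself is not shown in these examples. A sketch of one common shape for it, under heavy assumptions: self.word_vectors and self.embedding_size are hypothetical attributes standing in for an embedding lookup.

import numpy as np

def transform_sentence_batch_to_vector(self, texts, words_count):
    # Truncate every text to words_count tokens, then look each token up
    # in the (assumed) embedding table; unknown words stay all-zero.
    batch = np.zeros((len(texts), words_count, self.embedding_size))
    for i, text in enumerate(texts):
        for j, word in enumerate(text.split()[:words_count]):
            if word in self.word_vectors:
                batch[i, j] = self.word_vectors[word]
    return batch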
def process_train(self, fasttext_file_name, data_reader, ids, batch_size=50):
    # Dump the data in fastText's supervised format: one
    # '__label__<y> <text>' line per answer.
    with open(fasttext_file_name, 'w', encoding='utf-8') as fasttext_file:
        for X, y in data_reader.get_raw_data_labels_batch(set(ids),
                                                          batch_size):
            for i in range(len(y)):
                fasttext_file.write('__label__{} {}\n'.format(
                    y[i], lower_text(process_html(X['body'].iloc[i]))))
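The file written above follows fastText's supervised-learning format, so it can be consumed directly by the fasttext Python bindings. A usage sketch; the file name and hyperparameters are placeholders:

import fasttext

model = fasttext.train_supervised(input='train.ft', epoch=5, wordNgrams=2)
print(model.predict('some preprocessed answer text'))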
def get_processed_texts_as_lists(self, ids):
    # Emit each answer body as a lower-cased, HTML-stripped token list.
    for X, y in self._get_data_labels(ids):
        for _, row in X.iterrows():
            yield lower_text(process_html(row['body'])).split()