def get_processed_questions_and_answers_as_lists(self, ids):
    # Yield the question title, question body and answer body of every row
    # as lists of lowercased tokens with HTML markup stripped.
    for X, y in self._get_data_labels(ids):
        while not X.empty:
            yield lower_text(process_html(X['question_title'].iloc[0])).split()
            yield lower_text(process_html(X['question_body'].iloc[0])).split()
            yield lower_text(process_html(X['body'].iloc[0])).split()
            X = X.iloc[1:]

def transform_input(self, data):
    # Clean both text columns and turn each batch into fixed-size vectors.
    question_body = self.transform_sentence_batch_to_vector(
        [lower_text(process_html(X)) for X in data['question_body']],
        self.question_body_words_count)
    answer_body = self.transform_sentence_batch_to_vector(
        [lower_text(process_html(X)) for X in data['body']],
        self.answer_body_words_count)
    return [question_body, answer_body]

def get_processed_texts_as_lists(self, ids):
    # Collect both data frames returned for every batch, then yield each
    # answer body as a list of lowercased tokens with HTML markup stripped.
    X = pd.DataFrame()
    for X0_i, y0_i, X1_i, y1_i in self._get_data_labels(ids):
        X = pd.concat([X, X0_i, X1_i])
    while not X.empty:
        yield lower_text(process_html(X['body'].iloc[0])).split()
        X = X.iloc[1:]

def __iter__(self):
    # Stream the Wikipedia dump, split section texts into sentences and
    # yield every sufficiently long sentence as a list of lowercased tokens.
    data = api.load(self.filename)
    for article in data:
        for text in article['section_texts']:
            sentences = text.split('.')
            for sentence in sentences:
                sentence = lower_text(sentence)
                if len(sentence) >= WikiSentences.MIN_SENTENCE_LENGTH:
                    yield sentence.split()

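# Usage sketch (an addition, not part of the original code): an iterable such
# as WikiSentences, which yields one token list per sentence and can be
# iterated repeatedly, can be passed directly to gensim's Word2Vec. The
# constructor argument and the output path below are assumptions made here
# for illustration only.
from gensim.models import Word2Vec

sentences = WikiSentences('wiki-english-20171001')
model = Word2Vec(sentences=sentences, min_count=5, workers=4)
model.save('wiki_word2vec.model')  # hypothetical output path
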
def test_lengths(csv_path):
    # Collect character lengths of the cleaned question titles, question
    # bodies and answer bodies, then plot histograms and print basic stats.
    data_reader = ImbalancedDataReader(csv_path, 'question_id')
    question_title_lengths = []
    question_body_lengths = []
    answer_body_lengths = []
    cnt = 0
    for X, y in data_reader.get_raw_data_labels_batch(data_reader.get_ids(), 50):
        cnt += 1
        if cnt % 10 == 0:
            print(cnt)
        for i in range(50):
            question_title_lengths.append(
                len(lower_text(process_html(X.iloc[i]['question_title']))))
            question_body_lengths.append(
                len(lower_text(process_html(X.iloc[i]['question_body']))))
            answer_body_lengths.append(
                len(lower_text(process_html(X.iloc[i]['body']))))
    draw_lens_histogram(question_body_lengths, 'question', 'question_lengths')
    draw_lens_histogram(answer_body_lengths, 'answer', 'answer_lengths')
    print(np.mean(question_body_lengths))
    print(np.mean(answer_body_lengths))
    print(np.median(question_body_lengths))
    print(np.median(answer_body_lengths))

import gensim.downloader as api

from utils.word_utils import lower_text

# Count the articles, sentences and words in the Wikipedia dump to estimate
# the corpus size before training embeddings on it.
print('was')
data = api.load('wiki-english-20171001')
print('was')

article_count = 0
sentence_count = 0
word_count = 0
for article in data:
    article_count += 1
    if article_count % 100 == 0:
        print(article_count)
    for text in article['section_texts']:
        sentences = text.split('.')
        for sentence in sentences:
            sentence = lower_text(sentence)
            if len(sentence) >= 5:
                sentence_count += 1
                word_count += len(sentence.split())

print(article_count)
print(sentence_count)
print(word_count)

def transform_input(self, data):
    # Clean and vectorize the answer body, then append the additional
    # features computed by get_other_features from the remaining columns.
    answer_body = self.transform_sentence_batch_to_vector(
        [lower_text(process_html(X)) for X in data['body']],
        self.answer_body_words_count)
    other_features = self.get_other_features(data)
    return [answer_body, other_features]

def process_train(self, fasttext_file_name, data_reader, ids, batch_size=50):
    # Write the training data in fastText's supervised format, one line per
    # sample: "__label__<label> <cleaned answer body>".
    with open(fasttext_file_name, 'w', encoding='utf-8') as fasttext_file:
        for X, y in data_reader.get_raw_data_labels_batch(set(ids), batch_size):
            for i in range(len(y)):
                fasttext_file.write('__label__{} {}\n'.format(
                    y[i], lower_text(process_html(X['body'].iloc[i]))))

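# Usage sketch (an addition, not part of the original code): the file written
# by process_train follows fastText's supervised format ("__label__<y> <text>"
# per line), so it can be fed to the fasttext package's train_supervised. The
# file name and hyperparameters below are illustrative assumptions only.
import fasttext

model = fasttext.train_supervised(input='train.txt', epoch=5, lr=0.1)
model.save_model('answers_fasttext.bin')  # hypothetical output path
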
def get_processed_texts_as_lists(self, ids):
    # Yield every answer body as a list of lowercased tokens with HTML
    # markup stripped.
    for X, y in self._get_data_labels(ids):
        while not X.empty:
            yield lower_text(process_html(X['body'].iloc[0])).split()
            X = X.iloc[1:]