def get_processed_questions_and_answers_as_lists(self, ids):
    # Stream every question title, question body and answer body in the
    # selected ids as a lower-cased, HTML-stripped list of tokens.
    for X, y in self._get_data_labels(ids):
        for _, row in X.iterrows():
            yield lower_text(process_html(row['question_title'])).split()
            yield lower_text(process_html(row['question_body'])).split()
            yield lower_text(process_html(row['body'])).split()

def transform_input(self, data):
    # Preprocess and vectorise question bodies and answer bodies
    # separately, each to its configured word count.
    question_body = self.transform_sentence_batch_to_vector(
        [lower_text(process_html(X)) for X in data['question_body']],
        self.question_body_words_count)
    answer_body = self.transform_sentence_batch_to_vector(
        [lower_text(process_html(X)) for X in data['body']],
        self.answer_body_words_count)

    return [question_body, answer_body]
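All of these examples lean on two project-local helpers, process_html and lower_text, which are not shown on this page. A plausible minimal sketch, assuming process_html strips markup down to visible text and lower_text lowercases and keeps only letters (both assumptions):

import re
from bs4 import BeautifulSoup

def process_html(text):
    # Assumed behaviour: drop HTML tags, keep only the visible text.
    return BeautifulSoup(text, 'html.parser').get_text(separator=' ')

def lower_text(text):
    # Assumed behaviour: lowercase and collapse non-letter runs to spaces.
    return re.sub(r'[^a-z]+', ' ', text.lower()).strip()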
Example #3
def get_processed_texts_as_lists(self, ids):
    # The reader yields paired batches (X0, y0, X1, y1); emit every answer
    # body from both halves as a token list. (The original accumulator was
    # always empty again by the next batch, so it is dropped here.)
    for X0_i, y0_i, X1_i, y1_i in self._get_data_labels(ids):
        X = pd.concat([X0_i, X1_i])
        for _, row in X.iterrows():
            yield lower_text(process_html(row['body'])).split()
Example #4
def __iter__(self):
    # Stream the Wikipedia dump sentence by sentence, so the consumer can
    # make repeated passes without loading the corpus into memory.
    data = api.load(self.filename)
    for article in data:
        for text in article['section_texts']:
            for sentence in text.split('.'):
                sentence = lower_text(sentence)
                if len(sentence) >= WikiSentences.MIN_SENTENCE_LENGTH:
                    yield sentence.split()
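This __iter__ is gensim's streaming-corpus idiom: Word2Vec re-iterates the object once per training pass, so nothing is materialised. A minimal usage sketch (the constructor argument and hyperparameters are assumptions; vector_size is the gensim 4 name, older versions call it size):

from gensim.models import Word2Vec

sentences = WikiSentences('wiki-english-20171001')  # filename assumed
model = Word2Vec(sentences=sentences, vector_size=100, min_count=5, workers=4)
model.save('wiki_word2vec.model')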
Example #5
import numpy as np

def test_lengths(csv_path):
    data_reader = ImbalancedDataReader(csv_path, 'question_id')
    question_title_lengths = []
    question_body_lengths = []
    answer_body_lengths = []
    cnt = 0
    for X, y in data_reader.get_raw_data_labels_batch(data_reader.get_ids(),
                                                      50):
        cnt += 1
        if cnt % 10 == 0:
            print(cnt)  # progress: one line every 10 batches
        # Iterate over the actual batch size; the last batch may hold
        # fewer than 50 rows.
        for i in range(len(X)):
            question_title_lengths.append(
                len(lower_text(process_html(X.iloc[i]['question_title']))))
            question_body_lengths.append(
                len(lower_text(process_html(X.iloc[i]['question_body']))))
            answer_body_lengths.append(
                len(lower_text(process_html(X.iloc[i]['body']))))
    draw_lens_histogram(question_body_lengths, 'question', 'question_lengths')
    draw_lens_histogram(answer_body_lengths, 'answer', 'answer_lengths')
    print(np.mean(question_body_lengths))
    print(np.mean(answer_body_lengths))
    print(np.median(question_body_lengths))
    print(np.median(answer_body_lengths))
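draw_lens_histogram is project-local and not shown on this page. A minimal matplotlib sketch matching the (lengths, label, file_name) call above; the bin count and the PNG output are assumptions:

import matplotlib.pyplot as plt

def draw_lens_histogram(lengths, label, file_name):
    plt.figure()
    plt.hist(lengths, bins=50)
    plt.xlabel('{} length, characters'.format(label))
    plt.ylabel('count')
    plt.savefig('{}.png'.format(file_name))
    plt.close()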
Example #6
import gensim.downloader as api

from utils.word_utils import lower_text

print('loading wiki-english-20171001 (downloads on first use)')
data = api.load('wiki-english-20171001')
print('corpus ready')
article_count = 0
sentence_count = 0
word_count = 0
for article in data:
    article_count += 1
    if article_count % 100 == 0:
        print(article_count)
    for text in article['section_texts']:
        sentences = text.split('.')
        for sentence in sentences:
            sentence = lower_text(sentence)
            if len(sentence) >= 5:  # skip near-empty fragments from the split
                sentence_count += 1
                word_count += len(sentence.split())

print('articles:', article_count)
print('sentences:', sentence_count)
print('words:', word_count)
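For orientation, each record this corpus yields is a dict; the loop above relies only on its 'section_texts' key. A quick inspection sketch (any other keys printed are whatever gensim-data ships, not guaranteed here):

first_article = next(iter(data))
print(sorted(first_article.keys()))  # 'section_texts' is the key used above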
def transform_input(self, data):
    # Answer-only variant: vectorise the answer body and pair it with the
    # extra features from get_other_features.
    answer_body = self.transform_sentence_batch_to_vector(
        [lower_text(process_html(X)) for X in data['body']],
        self.answer_body_words_count)
    other_features = self.get_other_features(data)
    return [answer_body, other_features]
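transform_sentence_batch_to_vector itself is not shown in these examples. A sketch of one common shape for it, under heavy assumptions: self.word_vectors and self.embedding_size are hypothetical attributes standing in for an embedding lookup.

import numpy as np

def transform_sentence_batch_to_vector(self, texts, words_count):
    # Truncate every text to words_count tokens, then look each token up
    # in the (assumed) embedding table; unknown words stay all-zero.
    batch = np.zeros((len(texts), words_count, self.embedding_size))
    for i, text in enumerate(texts):
        for j, word in enumerate(text.split()[:words_count]):
            if word in self.word_vectors:
                batch[i, j] = self.word_vectors[word]
    return batch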
def process_train(self, fasttext_file_name, data_reader, ids, batch_size=50):
    # Dump the data in fastText's supervised format: one
    # '__label__<y> <text>' line per answer.
    with open(fasttext_file_name, 'w', encoding='utf-8') as fasttext_file:
        for X, y in data_reader.get_raw_data_labels_batch(set(ids),
                                                          batch_size):
            for i in range(len(y)):
                fasttext_file.write('__label__{} {}\n'.format(
                    y[i], lower_text(process_html(X['body'].iloc[i]))))
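The file written above follows fastText's supervised-learning format, so it can be consumed directly by the fasttext Python bindings. A usage sketch; the file name and hyperparameters are placeholders:

import fasttext

model = fasttext.train_supervised(input='train.ft', epoch=5, wordNgrams=2)
print(model.predict('some preprocessed answer text'))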
def get_processed_texts_as_lists(self, ids):
    # Emit each answer body as a lower-cased, HTML-stripped token list.
    for X, y in self._get_data_labels(ids):
        for _, row in X.iterrows():
            yield lower_text(process_html(row['body'])).split()