Exemplo n.º 1
0
def extract_phrases(filepath):
    """Collect frequent phrases from a text file, processed in batches.

    All lines of ``filepath`` are read, then consumed in batches of 10000
    lines (via ``split_every``). Each line is passed through
    ``StringCleaner.clean`` and stripped of trailing newlines; the cleaned
    lines of a batch are joined with single spaces and handed to
    ``frequent_phrases`` with ``top_k=100``. The per-batch result is merged
    into a running dictionary with ``update_top_phrase_dict``.

    Args:
        filepath: Path of the text file to read.

    Returns:
        The dictionary returned by ``update_top_phrase_dict`` after the last
        batch, or an empty dict when no batch was produced.
    """
    with open(filepath, 'r') as file:
        # Kept as a list (rather than iterating the handle lazily) so that
        # tqdm can read its length for the progress bar.
        all_lines = file.readlines()
        logging.info('Initializing for roller coaster ride')
        overall_top_phrases_dict = {}
        for batch_lines in split_every(size=10000,
                                       iterable=tqdm(all_lines,
                                                     unit='line processed')):
            logging.info('Length of line being processed:%s',
                         len(batch_lines))
            logging.debug(
                'Length of single-line in batch  being processed:%s',
                len(batch_lines[0]))
            lines_list = [
                StringCleaner.clean(line).rstrip('\n')
                for line in tqdm(batch_lines)
            ]
            text = ' '.join(lines_list)
            logging.debug('Processing text:%s..', text[:100])
            batch_top_phrases_dict = dict(frequent_phrases(text, top_k=100))
            logging.debug('Got total %s frequent phrases.',
                          len(batch_top_phrases_dict))
            logging.debug('Frequent phrases in batch:%s ...',
                          list(batch_top_phrases_dict.keys())[:5])
            # Merge exactly once per batch. Merging the same batch twice
            # would double-count it if update_top_phrase_dict mutates its
            # first argument (NOTE(review): helper not visible here).
            overall_top_phrases_dict = update_top_phrase_dict(
                overall_top_phrases_dict, batch_top_phrases_dict)
        return overall_top_phrases_dict
    def test__unicode_normalizer(self):
        """Compare ``StringCleaner._unicode_normalizer`` output to a literal.

        The fixture ``self.unicode_string`` is run through the normalizer and
        the result must equal the expected string below.

        NOTE: this exercises a private helper directly.
        """
        expected = 'a  food was camc amazing!! . the place was  good.'
        actual = StringCleaner._unicode_normalizer(self.unicode_string)
        self.assertEqual(actual, expected)
 def test_clean_word_empty(self):
     """A single space must come back as the empty string."""
     result = StringCleaner.remove_multiple_whitespace(' ')
     self.assertEqual(result, '')
 def test_clean_word_multi_word_with_spaces_between(self):
     """Two capitalised words with padding and a double inner space."""
     raw = ' Brown  Beer '
     result = StringCleaner.remove_multiple_whitespace(raw)
     self.assertEqual(result, 'brown beer')
 def test_clean_word_upper_and_spaces_and_multi_word(self):
     """Two capitalised words padded with one space on each side."""
     raw = ' Brown Beer '
     result = StringCleaner.remove_multiple_whitespace(raw)
     self.assertEqual(result, 'brown beer')
 def test_clean_word_upper_and_spaces(self):
     """Mixed-case word with a leading space is expected as ``'beer'``."""
     result = StringCleaner.remove_multiple_whitespace(' BEEr')
     self.assertEqual(result, 'beer')
 def test_clean_word_space_left(self):
     """Lower-case word with a leading space is expected as ``'beer'``."""
     result = StringCleaner.remove_multiple_whitespace(' beer')
     self.assertEqual(result, 'beer')
 def test_make_string_alphanumeric(self):
     """Compare ``make_string_alphanumeric`` on the fixture to a literal."""
     expected = 'food was amamazing. The place was  good.'
     actual = StringCleaner.make_string_alphanumeric(self.unicode_string)
     self.assertEqual(actual, expected)
 def test_clean(self):
     """Compare ``StringCleaner.clean`` on the fixture to a literal.

     The cleaned string is also printed between ``#`` markers before the
     assertion runs.
     """
     expected = 'a food was camc amazing! . the place was good.'
     actual = StringCleaner.clean(self.unicode_string)
     print('#', actual, '#')
     self.assertEqual(expected, actual)