def test_too_many_counts(self):
    """Check that word counts for a large fixture are capped at MAX_ITEMS.

    Converts the ``22kAmazonGameReview.txt`` fixture to text, runs
    ``wordhandler.get_word_counts`` on it (both flag arguments ``True``,
    language ``'english'``) and asserts that each of the first three
    entries of the result holds exactly ``wordhandler.MAX_ITEMS`` items.
    """
    review_file = os.path.join(self._fixtures_dir, '22kAmazonGameReview.txt')
    text = filehandler.convert_to_txt(review_file)
    results = wordhandler.get_word_counts(text, True, True, 'english')
    # Same three checks as before, in the same order: results[0], [1], [2].
    for position in range(3):
        self.assertEqual(len(results[position]), wordhandler.MAX_ITEMS)
def test_too_many_counts(self):
    """Large inputs must yield exactly MAX_ITEMS entries per count list.

    The ``22kAmazonGameReview.txt`` fixture is converted to text and passed
    to ``wordhandler.get_word_counts`` with both flag arguments ``True`` and
    ``'english'`` as the language. Elements 0, 1 and 2 of the result are
    each expected to have length ``wordhandler.MAX_ITEMS``.
    """
    fixture_file = os.path.join(
        self._fixtures_dir, '22kAmazonGameReview.txt')
    fixture_text = filehandler.convert_to_txt(fixture_file)
    count_lists = wordhandler.get_word_counts(
        fixture_text, True, True, 'english')
    for index in (0, 1, 2):
        actual_length = len(count_lists[index])
        self.assertEqual(actual_length, wordhandler.MAX_ITEMS)
def process_words(words, ignore_case, ignore_stopwords, is_sample):
    """Count words using the stopword language for the current UI language.

    ``g.current_lang`` selects the stopword language: ``'es'`` -> spanish,
    ``'pt'`` -> portuguese, ``'hu'`` -> hungarian, anything else -> english.

    Args:
        words: passed through to ``wordhandler.get_word_counts``.
        ignore_case: passed through to ``wordhandler.get_word_counts``.
        ignore_stopwords: passed through to ``wordhandler.get_word_counts``.
        is_sample: accepted for the caller's benefit; not used here.

    Returns:
        Whatever ``wordhandler.get_word_counts`` returns.
    """
    language_by_code = {
        'es': 'spanish',
        'pt': 'portuguese',
        'hu': 'hungarian',
    }
    # Unknown or unset language codes fall back to English stopwords.
    stopwords_language = language_by_code.get(g.current_lang, 'english')
    return wordhandler.get_word_counts(
        words, ignore_case, ignore_stopwords, stopwords_language)
def process_words(words, ignore_case, ignore_stopwords, is_sample):
    """Run the word counter with stopwords matching ``g.current_lang``.

    The language code is compared against ``'es'``, ``'pt'`` and ``'hu'``
    (in that order) to pick spanish, portuguese or hungarian stopwords;
    any other value keeps the english default.

    Args:
        words: forwarded to ``wordhandler.get_word_counts``.
        ignore_case: forwarded to ``wordhandler.get_word_counts``.
        ignore_stopwords: forwarded to ``wordhandler.get_word_counts``.
        is_sample: not used inside this function.

    Returns:
        The value returned by ``wordhandler.get_word_counts``.
    """
    code_to_language = (
        ('es', 'spanish'),
        ('pt', 'portuguese'),
        ('hu', 'hungarian'),
    )
    chosen_language = 'english'
    for code, language_name in code_to_language:
        if g.current_lang == code:
            chosen_language = language_name
            break
    return wordhandler.get_word_counts(
        words, ignore_case, ignore_stopwords, chosen_language)
def _process_words(words, ignore_case, ignore_stopwords, is_sample):
    """Count words with the stopword language mapped from ``g.current_lang``.

    The stopword language is read from ``NLTK_STOPWORDS_BY_LANGUAGE`` using
    ``g.current_lang`` as the key; a key that is not present propagates
    whatever error that lookup raises.

    Args:
        words: forwarded to ``wordhandler.get_word_counts``.
        ignore_case: forwarded to ``wordhandler.get_word_counts``.
        ignore_stopwords: forwarded to ``wordhandler.get_word_counts``.
        is_sample: not used inside this function.

    Returns:
        The value returned by ``wordhandler.get_word_counts``.
    """
    # Resolve the language first so a bad key fails before any counting.
    language_for_stopwords = NLTK_STOPWORDS_BY_LANGUAGE[g.current_lang]
    return wordhandler.get_word_counts(
        words,
        ignore_case,
        ignore_stopwords,
        language_for_stopwords,
    )
# get_summary(self, language): build a per-column summary of the CSV.
#
# Flow, as far as the code below shows:
#   1. If self._csv_has_rows(self.input_path) is falsy, return
#      {'row_count': 0, 'columns': []} immediately.
#   2. Detect the delimiter (self.detectDelimiter()) and load the file with
#      table.Table.from_csv(self.input_file, ...). Any exception while
#      loading is logged and the string 'bad_formatting' is returned.
#   3. row_count is tab.count_rows() + 1, minus 1 when self.has_header_row.
#   4. For every column: columns named '_unnamed' are skipped, and if every
#      column is '_unnamed' the string 'bad_formatting' is returned.
#      Otherwise the column type is inferred from its non-None values
#      (a random sample of 100 when SAMPLE_FOR_TYPE is set and there are
#      more than 100 values) by counting how many look like numbers, dates
#      and times and comparing each share against a 0.5 threshold.
#      Values are then cleaned for that type, every op in OPERATIONS is run
#      via self.get_<op>(c, values, stats), and a column_info dict
#      ('index', 'name', 'type', 'nulls', 'display_type_name', plus
#      type-dependent statistics) is appended to results['columns'].
#      Columns whose type is None are marked 'empty' and not appended.
#   5. `language` is used only as the key into NLTK_STOPWORDS_BY_LANGUAGE
#      when word counts are computed for text columns.
#
# Returns: the results dict, or the string 'bad_formatting' (callers must
# handle both shapes).
#
# NOTE(review): in this text the body's line breaks and indentation are
# collapsed, so the nesting of the later branches (the min/max/deciles
# block, 'others', 'max_str_len', 'word_counts') cannot be confirmed from
# here. Verify against a correctly formatted copy before changing logic.
# NOTE(review): `c.type == None` should presumably be `c.type is None`.
# NOTE(review): `dt in 'numbers'` is a substring test; it gives the same
# answer as `dt == 'numbers'` for the four display names that reach it,
# but equality looks like the intent.
# NOTE(review): the number count relies on the truthiness of
# self.is_number(v), while the cleaning step filters on `is not None`.
# If is_number returns the parsed number, a value of zero would not be
# counted as a number. Confirm against is_number's definition.
# NOTE(review): self.is_date(v) and self.is_number(v) are each evaluated
# twice per value (once to test, once to use); old_len, new_len and
# op_start_time are only referenced by commented-out debug lines.
# NOTE(review): a value counts as a date only when its date differs from
# datetime.date.today(). Presumably is_date fills a missing date with
# today's date; confirm.
def get_summary(self, language): summary_start = time.time() results = {} if not self._csv_has_rows(self.input_path): results['row_count'] = 0 results['columns'] = [] return results start_time = time.time() delim = self.detectDelimiter() try: tab = table.Table.from_csv(self.input_file, delimiter=delim, quotechar='"') except Exception as e: logger.debug("Error making a table from the CSV") logger.error(e) return 'bad_formatting' logger.debug(" %f ms to create table from csv" % (1000*(time.time()-start_time))) row_count = tab.count_rows() + 1 # this value is inaccurate so I'm adding 1 if self.has_header_row: row_count -= 1 results['row_count'] = row_count logger.debug(" found %d rows" % row_count) column_count = len(tab) empty_header_count = 0 results['columns'] = [] for c in tab: logger.debug(" column: %s" % c.name) """ skip over columns that don't have headers also count the number of columns without headers and if all columns are missing headers, tell the user that the csv is poorly formatted """ if c.name == '_unnamed': empty_header_count += 1 if empty_header_count == column_count: return 'bad_formatting' continue column_info = {} column_info['index'] = c.order + 1 column_info['name'] = c.name values = sorted([i for i in c if i is not None]) stats = {} # figure out what type the column is start_time = time.time() date_count = 0 time_count = 0 number_count = 0 value_count = len(values) if SAMPLE_FOR_TYPE and (value_count > 100): # try sampling to speed this up sampled_values = random.sample(values, 100) else: sampled_values = values sampled_value_count = len(sampled_values) for v in sampled_values: if type(v) in [float, int, complex] or self.is_number(v): number_count += 1 if self.is_date(v) is not None: v = self.is_date(v) if v.time() != datetime.time(0, 0): time_count += 1 if v.date() != datetime.date.today(): date_count += 1 if sampled_value_count > 0: date_percent = float(date_count) / float(sampled_value_count) time_percent = float(time_count) / 
float(sampled_value_count) number_percent = float(number_count) / float(sampled_value_count) else: date_percent = 0 time_percent = 0 number_percent = 0 threshold = 0.5 if number_percent < threshold: if date_percent > threshold: if time_percent > threshold: c.type = datetime.datetime else: c.type = datetime.date elif time_percent > threshold: c.type = datetime.time else: c.type = float logger.debug(" type is %s (%f ms)" % (c.type, (time.time()-start_time)*1000)) # clean the data, based on the type it is start_time = time.time() if c.type == datetime.datetime or c.type == datetime.date or c.type == datetime.time: old_len = len(values) values = [ self.is_date(v).replace(tzinfo=None) for v in values if self.is_date(v) is not None ] new_len = len(values) #logger.debug(" removed %d bad values" % (old_len-new_len)) elif c.type == float: old_len = len(values) values = [ self.is_number(v) for v in values if self.is_number(v) is not None ] new_len = len(values) #logger.debug(" removed %d bad values" % (old_len-new_len)) elif c.type == str: old_len = len(values) values = [ v for v in values if v != ' ' ] new_len = len(values) #logger.debug(" removed %d bad values" % (old_len-new_len)) #logger.debug(" cleaned in %f ms" % ((time.time()-start_time)*1000)) # do the default operations on the values start_time = time.time() for op in OPERATIONS: op_start_time = time.time() stats[op] = getattr(self, 'get_%s' % op)(c, values, stats) #logger.debug(" %s in %f" % (op,(time.time()-op_start_time)*1000)) #logger.debug(" default ops took %f ms" % ((time.time()-start_time)*1000)) if c.type == None: column_info['type'] = 'empty' continue column_info['type'] = c.type.__name__ column_info['nulls'] = stats['nulls'] t = column_info['type'] dt = 'undefined' if any(t in s for s in ['float', 'int', 'long', 'complex']): dt = 'numbers' if 'str' in t: dt = 'text' if 'time' in t: dt = 'times' if 'date' in t: dt = 'dates' if 'datetime' in t: dt = 'dates and times' if 'bool' in t: dt = 'booleans' 
column_info['display_type_name'] = dt if dt in ['numbers', 'dates', 'times', 'dates and times']: if len(stats['unique']) <= NUMBER_MAX_UNIQUE: column_info['most_freq_values'] = sorted(self.get_most_freq_values(stats), key=itemgetter('value')) else: column_info['uniques'] = len(stats['unique']) column_info['min'] = stats['min'] column_info['max'] = stats['max'] column_info['deciles'] = stats['deciles'] if dt in 'numbers': column_info['sum'] = stats['sum'] column_info['mean'] = stats['mean'] column_info['median'] = stats['median'] column_info['stdev'] = stats['stdev'] else: # if there are few unique values, get every value and their frequency if len(stats['unique']) <= MAX_UNIQUE and c.type is not bool: column_info['values'] = self.get_most_freq_values(stats) column_info['most_freq_values'] = self.get_most_freq_values(stats) else: column_info['uniques'] = len(stats['unique']) # get the most frequent repeating values, if any if column_info['uniques'] != len(values): column_info['most_freq_values'] = self.get_most_freq_values(stats) if c.type is not bool: column_info['others'] = stats['others'] # for text columns, get the longest string if c.type == six.text_type: column_info['max_str_len'] = stats['len'] if 'str' in column_info['type'] and not 'most_freq_values' in column_info: # TODO: these results could be cleaned up using textmining # TODO: send in the language properly? stopwords_language = NLTK_STOPWORDS_BY_LANGUAGE[language] column_info['word_counts'] = wordhandler.get_word_counts( str([s for s in values]).strip('[]').replace("u'", '').replace("',", ''), True, True, stopwords_language, False, False) results['columns'].append( column_info ) logger.debug(" done in %f ms" % ((time.time()-summary_start)*1000 )) return results