Exemplo n.º 1
0
 def test_too_many_counts(self):
     # Run the word counter over the large Amazon game-review fixture and
     # check that each of the first three result lists has exactly
     # wordhandler.MAX_ITEMS entries.
     review_file = os.path.join(self._fixtures_dir, '22kAmazonGameReview.txt')
     text = filehandler.convert_to_txt(review_file)
     results = wordhandler.get_word_counts(text, True, True, 'english')
     # Checked in order; the first failing list aborts the test.
     for position in (0, 1, 2):
         self.assertEqual(len(results[position]), wordhandler.MAX_ITEMS)
Exemplo n.º 2
0
 def test_too_many_counts(self):
     # The fixture holds far more distinct terms than can be reported, so
     # the test expects each of the three returned lists to contain exactly
     # wordhandler.MAX_ITEMS entries.
     fixture_name = '22kAmazonGameReview.txt'
     text = filehandler.convert_to_txt(
         os.path.join(self._fixtures_dir, fixture_name))
     results = wordhandler.get_word_counts(text, True, True, 'english')
     # Assert on indexes 0, 1 and 2 in that order.
     for position in range(3):
         self.assertEqual(len(results[position]), wordhandler.MAX_ITEMS)
Exemplo n.º 3
0
def process_words(words, ignore_case, ignore_stopwords, is_sample):
    """Return word counts for ``words`` using language-specific stopwords.

    The stopword language is chosen from ``g.current_lang``: ``'es'``,
    ``'pt'`` and ``'hu'`` map to Spanish, Portuguese and Hungarian; any
    other value falls back to English.  ``is_sample`` is accepted but not
    used here.
    """
    language_by_code = {
        'es': 'spanish',
        'pt': 'portuguese',
        'hu': 'hungarian',
    }
    chosen_language = language_by_code.get(g.current_lang, 'english')
    return wordhandler.get_word_counts(words, ignore_case, ignore_stopwords,
                                       chosen_language)
Exemplo n.º 4
0
def process_words(words, ignore_case, ignore_stopwords, is_sample):
    """Count the words in ``words`` with stopwords for the current language.

    ``g.current_lang`` selects the stopword list name handed to
    ``wordhandler.get_word_counts``; codes other than ``'es'``, ``'pt'``
    and ``'hu'`` use ``'english'``.  ``is_sample`` is part of the signature
    but is not read by this function.
    """
    # Two-letter language code -> stopword list name.
    known_languages = {'es': 'spanish', 'pt': 'portuguese', 'hu': 'hungarian'}
    stopword_list_name = known_languages.get(g.current_lang, 'english')

    return wordhandler.get_word_counts(
        words, ignore_case, ignore_stopwords, stopword_list_name)
Exemplo n.º 5
0
def _process_words(words, ignore_case, ignore_stopwords, is_sample):
    """Return word counts for ``words`` in the current language.

    The stopword list name is looked up in ``NLTK_STOPWORDS_BY_LANGUAGE``
    with ``g.current_lang`` as the key, so a language code missing from
    that mapping raises ``KeyError``.  ``is_sample`` is accepted but unused.
    """
    return wordhandler.get_word_counts(
        words,
        ignore_case,
        ignore_stopwords,
        NLTK_STOPWORDS_BY_LANGUAGE[g.current_lang],
    )
Exemplo n.º 6
0
    def get_summary(self, language):
        """Build a per-column summary of the CSV behind this object.

        For every column that has a header the method guesses a type from
        (a sample of) the non-null values, cleans the values for that type,
        runs each operation named in ``OPERATIONS`` (dispatching to
        ``self.get_<op>``) and copies the statistics relevant to the
        column's display type into the result.

        Args:
            language: key into ``NLTK_STOPWORDS_BY_LANGUAGE``; only looked
                up for text columns that end up without
                ``most_freq_values`` and therefore get ``word_counts``.

        Returns:
            A dict with ``row_count`` and ``columns`` (a list of per-column
            dicts), or the string ``'bad_formatting'`` when the table
            cannot be built from the CSV or every column lacks a header.
        """
        summary_start = time.time()
        results = {}

        # Nothing to summarize: report an empty table.
        if not self._csv_has_rows(self.input_path):
            results['row_count'] = 0
            results['columns'] = []
            return results

        start_time = time.time()
        delim = self.detectDelimiter()

        # Deliberately broad: any failure to parse is reported to the caller
        # as a formatting problem rather than raised.
        try:
            tab = table.Table.from_csv(self.input_file, delimiter=delim, quotechar='"')
        except Exception as e:
            logger.debug("Error making a table from the CSV")
            logger.error(e)
            return 'bad_formatting'

        logger.debug("  %f ms to create table from csv" % (1000*(time.time()-start_time)))

        row_count = tab.count_rows() + 1  # this value is inaccurate so I'm adding 1
        if self.has_header_row:
            row_count -= 1
        results['row_count'] = row_count
        logger.debug("  found %d rows" % row_count)

        column_count = len(tab)
        empty_header_count = 0

        results['columns'] = []
        for c in tab:
            logger.debug("  column: %s" % c.name)

            # Skip over columns that don't have headers, counting them as we
            # go; if all columns are missing headers, tell the user that the
            # csv is poorly formatted.
            if c.name == '_unnamed':
                empty_header_count += 1
                if empty_header_count == column_count:
                    return 'bad_formatting'
                continue

            column_info = {}
            column_info['index'] = c.order + 1
            column_info['name'] = c.name

            values = sorted(i for i in c if i is not None)

            stats = {}

            # figure out what type the column is
            start_time = time.time()

            date_count = 0
            time_count = 0
            number_count = 0
            value_count = len(values)
            if SAMPLE_FOR_TYPE and (value_count > 100):    # try sampling to speed this up
                sampled_values = random.sample(values, 100)
            else:
                sampled_values = values
            sampled_value_count = len(sampled_values)

            for v in sampled_values:
                # Exact type match on purpose (bool is not counted here).
                if type(v) in [float, int, complex] or self.is_number(v):
                    number_count += 1
                # Parse once and reuse the result instead of calling is_date twice.
                parsed = self.is_date(v)
                if parsed is not None:
                    # A non-midnight time counts as carrying a time component.
                    if parsed.time() != datetime.time(0, 0):
                        time_count += 1
                    # A date other than today counts as carrying a date
                    # component.
                    # NOTE(review): presumably is_date fills in today's date
                    # for time-only input -- confirm against is_date.
                    if parsed.date() != datetime.date.today():
                        date_count += 1

            if sampled_value_count > 0:
                date_percent = float(date_count) / float(sampled_value_count)
                time_percent = float(time_count) / float(sampled_value_count)
                number_percent = float(number_count) / float(sampled_value_count)
            else:
                date_percent = 0
                time_percent = 0
                number_percent = 0

            threshold = 0.5

            # Numbers win at or above the threshold; otherwise date/time
            # override the table's own type only when strictly above it.
            if number_percent < threshold:
                if date_percent > threshold:
                    if time_percent > threshold:
                        c.type = datetime.datetime
                    else:
                        c.type = datetime.date
                elif time_percent > threshold:
                    c.type = datetime.time
            else:
                c.type = float

            logger.debug("    type is %s (%f ms)" % (c.type, (time.time()-start_time)*1000))

            # clean the data, based on the type it is; each value is parsed
            # once and unparseable values are dropped
            if c.type in (datetime.datetime, datetime.date, datetime.time):
                parsed_values = (self.is_date(v) for v in values)
                values = [d.replace(tzinfo=None) for d in parsed_values if d is not None]
            elif c.type == float:
                parsed_values = (self.is_number(v) for v in values)
                values = [n for n in parsed_values if n is not None]
            elif c.type == str:
                values = [v for v in values if v != '&nbsp;']

            # do the default operations on the values; each operation receives
            # the stats gathered so far
            for op in OPERATIONS:
                stats[op] = getattr(self, 'get_%s' % op)(c, values, stats)

            if c.type is None:
                # NOTE(review): the column is skipped, so this 'empty' marker
                # never reaches the results -- confirm that is intended.
                column_info['type'] = 'empty'
                continue

            column_info['type'] = c.type.__name__
            column_info['nulls'] = stats['nulls']

            # Map the type name to a display name.  The checks are ordered on
            # purpose: 'datetime' contains both 'time' and 'date', so the most
            # specific match must come last.
            t = column_info['type']
            dt = 'undefined'
            if any(t in s for s in ['float', 'int', 'long', 'complex']):
                dt = 'numbers'
            if 'str' in t:
                dt = 'text'
            if 'time' in t:
                dt = 'times'
            if 'date' in t:
                dt = 'dates'
            if 'datetime' in t:
                dt = 'dates and times'
            if 'bool' in t:
                dt = 'booleans'
            column_info['display_type_name'] = dt

            if dt in ['numbers', 'dates', 'times', 'dates and times']:
                if len(stats['unique']) <= NUMBER_MAX_UNIQUE:
                    column_info['most_freq_values'] = sorted(self.get_most_freq_values(stats), key=itemgetter('value'))
                else:
                    column_info['uniques'] = len(stats['unique'])
                    column_info['min'] = stats['min']
                    column_info['max'] = stats['max']
                    column_info['deciles'] = stats['deciles']
                    # Equality, not substring membership in the string 'numbers'.
                    if dt == 'numbers':
                        column_info['sum'] = stats['sum']
                        column_info['mean'] = stats['mean']
                        column_info['median'] = stats['median']
                        column_info['stdev'] = stats['stdev']
            else:
                # if there are few unique values, get every value and their frequency
                if len(stats['unique']) <= MAX_UNIQUE and c.type is not bool:
                    # Two separate calls so the two keys do not share one object.
                    column_info['values'] = self.get_most_freq_values(stats)
                    column_info['most_freq_values'] = self.get_most_freq_values(stats)
                else:
                    column_info['uniques'] = len(stats['unique'])

                    # get the most frequent repeating values, if any
                    if column_info['uniques'] != len(values):
                        column_info['most_freq_values'] = self.get_most_freq_values(stats)
                        if c.type is not bool:
                            column_info['others'] = stats['others']

                    # for text columns, get the longest string
                    if c.type == six.text_type:
                        column_info['max_str_len'] = stats['len']

            if 'str' in column_info['type'] and 'most_freq_values' not in column_info:
                # TODO: these results could be cleaned up using textmining
                # TODO: send in the language properly?
                stopwords_language = NLTK_STOPWORDS_BY_LANGUAGE[language]
                # The values are flattened into one string via the list repr,
                # with the brackets, u' prefixes and quote-comma separators
                # stripped out.
                column_info['word_counts'] = wordhandler.get_word_counts(
                    str(list(values)).strip('[]').replace("u'", '').replace("',", ''),
                    True, True, stopwords_language, False, False)

            results['columns'].append(column_info)

        logger.debug("  done in %f ms" % ((time.time()-summary_start)*1000))
        return results