def compile_data_points(self, **kwargs):
    """Compile per-wordclass frequency data points and write them to disk.

    Iterates over the frequency entries under ``self.in_dir`` (restricted to
    1-grams), collects one ``(size, lifespan, start, frequency)`` tuple per
    (item, period) pair, buckets them by wordclass category in
    ``self.data_points``, and writes each bucket — sorted by item size —
    as a tab-separated ``<wordclass>.txt`` file in ``PREDICTIONS_DIR``.

    Keyword Args:
        letters: optional letter restriction, passed through to
            ``FrequencyIterator`` (default ``None`` = all letters).

    Side effects:
        Rebinds ``self.data_points`` and writes files into ``PREDICTIONS_DIR``.
    """
    letters = kwargs.get("letters", None)
    freq_iterator = FrequencyIterator(
        inDir=self.in_dir,
        outDir=None,
        letters=letters,
        message="Compiling data points",
    )

    # Keys will be wordclass values (NN, NNS, etc.); values will
    # be a list of data points
    self.data_points = defaultdict(list)
    for entry in freq_iterator.iterate():
        # Only 1-grams are considered.  # and len(entry.lex_items) == 1:
        if entry.gram_count() != 1:
            continue
        lex_items = self.largest_in_each_wordclass(entry.lex_items)
        for item in lex_items:
            # Hoisted: the original recomputed item.frequency_table()
            # twice per period (once for .data, once for .frequency()).
            freq_table = item.frequency_table()
            for period in freq_table.data:
                start, _end = PERIODS[period]  # period end date is unused
                lifespan = start - item.start
                # Keep only items whose start date is no more than 20
                # years after the period start (lifespan >= -20).
                if lifespan < -20:
                    continue
                wc = wordclass_category(item.wordclass)
                row = (
                    item.size(date=start),
                    int(lifespan),
                    start,
                    freq_table.frequency(period=period),
                )
                self.data_points[wc].append(row)
                self.data_points["ALL"].append(row)

    # Write one file per wordclass bucket, rows sorted by item size.
    for wordclass in self.data_points:
        self.data_points[wordclass].sort(key=lambda p: p[0])
        filepath = os.path.join(PREDICTIONS_DIR, wordclass + ".txt")
        with open(filepath, "w") as fh:
            for data_point in self.data_points[wordclass]:
                fh.write("%0.3g\t%d\t%d\t%0.4g\n" % data_point)
def keyfunc(lex_item):
    """Return the wordclass category of *lex_item*'s wordclass."""
    category = wordclass_category(lex_item.wordclass)
    return category