import logging
import operator
import random

import numpy
import pylab as p
from matplotlib.font_manager import FontProperties

# dumpSize, docs_under_thresh, page_parser, html_colors, rounded_interval,
# all_prefixes and strip_mediawiki_markup are assumed to be provided by the
# surrounding wikidump package; their imports are omitted here.


def wiki_proportions_chart(path, prefixes):
    """Horizontal bar chart of the proportion of documents under each threshold."""
    # Order the wikis by dump size so the bars grow with corpus size.
    prefixes, sizes = zip(*sorted([(pr, dumpSize(pr)) for pr in prefixes],
                                  key=operator.itemgetter(1)))
    blockSize = 5
    ind = p.arange(0, blockSize * len(prefixes), blockSize)  # y location for groups
    height = 4  # bar height
    thresholds = [5000, 2000, 1000, 500, 200, 100, 50, 20, 10]
    # Greyscale ramp: larger thresholds get darker bars.
    colors = [str(float(i + 1) / (len(thresholds) + 1)) for i in xrange(len(thresholds))]
    colors.reverse()

    p.clf()
    subbars = []
    for i, thresh in enumerate(thresholds):
        subbars.append(p.barh(ind,
                              [float(docs_under_thresh(pr, thresh)) / dumpSize(pr)
                               for pr in prefixes],
                              height,
                              color=colors[i % len(colors)],
                              linewidth=0,
                              align='center'))

    p.ylim(-height, len(prefixes) * blockSize)
    p.xlim(0, 1)
    yfontprop = FontProperties(size=4)
    xfontprop = FontProperties(size=4)
    p.xlabel('Proportion')
    p.ylabel('Language Code')
    p.title('Proportion of Documents Under Threshold')
    p.yticks(ind, prefixes, fontproperties=yfontprop)
    xmin, xmax = p.xlim()
    xtick_interval = 0.1
    p.xticks(p.arange(xmin, xmax, xtick_interval), fontproperties=xfontprop)
    p.gca().xaxis.grid(linestyle='-', linewidth=0.15)
    p.gca().yaxis.tick_left()

    p.savefig(path, dpi=300)
    p.close()
    p.clf()
def wiki_sizes_chart(path, prefixes, upperlimit=None):
    """Horizontal bar chart of the number of documents under each threshold."""
    prefixes, sizes = zip(*sorted([(pr, dumpSize(pr)) for pr in prefixes],
                                  key=operator.itemgetter(1)))
    blockSize = 5
    ind = p.arange(0, blockSize * len(prefixes), blockSize)  # y location for groups
    height = 4  # bar height
    colors = html_colors
    thresholds = [5000, 2000, 1000, 500, 200, 100, 50, 20, 10]

    # Background bar: the full dump size for each wiki.
    overall = p.barh(ind, sizes, height, color='b', linewidth=0, align='center')

    subbars = []
    for i, thresh in enumerate(thresholds):
        subbars.append(p.barh(ind,
                              [docs_under_thresh(pr, thresh) for pr in prefixes],
                              height,
                              color=colors[i % len(colors)],
                              linewidth=0,
                              align='center'))

    p.ylim(-height, len(prefixes) * blockSize)
    if upperlimit:
        p.xlim(0, upperlimit)
    yfontprop = FontProperties(size=4)
    xfontprop = FontProperties(size=4)
    p.xlabel('Documents')
    p.ylabel('Language Code')
    p.title('Number of Documents Under Threshold')
    p.yticks(ind, prefixes, fontproperties=yfontprop)
    xmin, xmax = p.xlim()
    xtick_interval = rounded_interval(xmin, xmax, 20, 2)
    p.xticks(p.arange(xmin, xmax, xtick_interval), fontproperties=xfontprop)
    p.gca().xaxis.grid(linestyle='-', linewidth=0.15)
    p.gca().yaxis.tick_left()
    p.legend([b[0] for b in subbars], map(str, thresholds),
             prop=xfontprop, loc='lower right')

    p.savefig(path, dpi=300)
    p.close()
    p.clf()
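# A minimal usage sketch, not part of the original module: it assumes
# `all_prefixes` (the iterable of wiki language prefixes used in the __main__
# block below) is available here, and picks illustrative PNG output paths.
def example_render_charts(output_dir='charts'):
    """Render both summary charts for every known wiki prefix."""
    import os
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    prefixes = list(all_prefixes)
    wiki_proportions_chart(os.path.join(output_dir, 'proportions.png'), prefixes)
    wiki_sizes_chart(os.path.join(output_dir, 'sizes.png'), prefixes, upperlimit=None)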
def create_dataset(num_doc, distribution, num_seg, text_transformer, exclude=[], seed=None):
    """
    Create a multilingual dataset from wikipedia data based on segments of
    monolingual documents.

    @param num_doc: number of documents in the intended dataset
    @param distribution: mapping from class name to proportion
    @param num_seg: number of segments per generated document
    @param text_transformer: callable applied to each raw dump entry
    @param exclude: docids to be excluded from the final dataset
    @param seed: seed for the random number generator
    """
    logger = logging.getLogger('wikidump.segment.create_dataset')
    logger.info('Creating dataset')
    docmap = {}
    classmap = {}

    # Set up a parser for each class
    parser = {}
    classlabels = []
    class_probs = numpy.empty(len(distribution), dtype=float)
    dump_indices = []
    denominator = 0
    logger.info('Setting up parsers')
    for i, classlabel in enumerate(distribution):
        parser[classlabel] = page_parser(classlabel)
        classlabels.append(classlabel)
        denominator += distribution[classlabel]
        class_probs[i] = denominator
        dumpsize = dumpSize(classlabel)
        indices = range(dumpsize)
        random.shuffle(indices)
        dump_indices.append(indices)
        logger.debug('Done for %s, %d indices', classlabel, dumpsize)
    logger.info('Parsers ready')

    # Normalize the cumulative proportions to a CDF over classes.
    class_probs /= denominator

    random.seed(seed)
    logger.info('Obtaining documents')
    for doc_num in range(num_doc):
        doc = ""
        classes = set()
        segids = []
        for seg_num in range(num_seg):
            # Draw a class according to the requested distribution.
            seg_class_index = numpy.searchsorted(class_probs, random.random())
            seg_class = classlabels[seg_class_index]
            classes.add(seg_class)
            doc_index = dump_indices[seg_class_index].pop(0)
            segid = "_".join((seg_class, str(doc_index)))
            # Skip over excluded documents, drawing replacements as needed.
            while segid in exclude:
                logger.debug('Ignored %s: in exclude', segid)
                try:
                    doc_index = dump_indices[seg_class_index].pop(0)
                except IndexError:
                    raise ValueError("No more documents in class %s available" % seg_class)
                segid = "_".join((seg_class, str(doc_index)))
            segids.append(segid)
            # Take the seg_num-th slice of the source document.
            content = text_transformer(parser[seg_class].get_entry(doc_index))
            seg_size = len(content) // num_seg
            seg_start = seg_size * seg_num
            seg_end = seg_start + seg_size
            doc += content[seg_start:seg_end]
        docid = '-'.join(segids)
        docmap[docid] = doc
        classmap[docid] = list(classes)
        logger.debug('Index: %d ID: %s', doc_num, docid)
    return docmap, classmap
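# A self-contained sketch of the CDF sampling used above: cumulative class
# weights are normalised to [0, 1] and numpy.searchsorted maps a uniform draw
# onto a class index. The distribution and function name are illustrative only.
def example_sample_classes(n=10, seed=42):
    """Draw n class labels from a toy distribution, mirroring create_dataset."""
    demo_distribution = {'en': 6, 'de': 3, 'fr': 1}
    labels = list(demo_distribution)
    cdf = numpy.cumsum([float(demo_distribution[label]) for label in labels])
    cdf /= cdf[-1]
    rng = random.Random(seed)
    return [labels[numpy.searchsorted(cdf, rng.random())] for _ in xrange(n)]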
raise ValueError, "No more documents in class %s available" % classlabel segid = "_".join((seg_class, str(doc_index))) segids.append(segid) content = text_transformer(parser[seg_class].get_entry(doc_index)) seg_size = len(content) / num_seg seg_start = seg_size * seg_num seg_end = seg_start + seg_size doc += content[seg_start:seg_end] docid = '-'.join(segids) docmap[docid] = doc classmap[docid] = list(classes) logger.debug('Index: %d ID: %s', doc_num, docid) return docmap, classmap if __name__ == "__main__": picklefile = open('segment.pickle', "w") distribution = {} for p in all_prefixes: if len(p) > 3: logger.warning('Ignoring %s', p) else: distribution[p] = dumpSize(p) data = create_dataset(5000, distribution, 2, strip_mediawiki_markup, exclude=[], seed=61383441363) dump(data, picklefile)
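# A minimal loading sketch (assumption: `dump` above is pickle/cPickle.dump, so
# segment.pickle holds the (docmap, classmap) tuple returned by create_dataset).
# The function name is illustrative only.
def load_segment_dataset(path='segment.pickle'):
    """Load a previously generated segment dataset from disk."""
    from cPickle import load
    with open(path, 'rb') as f:
        return load(f)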