Example #1
def wiki_proportions_chart(path, prefixes):
    prefixes, sizes = zip(*sorted([(pr, dumpSize(pr)) for pr in prefixes],
                                  key=operator.itemgetter(1)))

    blockSize = 5
    ind = p.arange(0, blockSize * len(prefixes),
                   blockSize)  # y location for groups
    height = 4  # bar height

    #colors = ['g','r','c','m','y']
    thresholds = [5000, 2000, 1000, 500, 200, 100, 50, 20, 10]
    # Grayscale shades: after the reverse, larger thresholds (plotted first,
    # as the widest bars) get lighter greys and smaller thresholds darker ones.
    colors = [
        str(float(i + 1) / (len(thresholds) + 1))
        for i in xrange(len(thresholds))
    ]
    colors.reverse()

    p.clf()
    """
  overall = p.barh( ind 
                  , [1.0] * len(ind) 
                  , height
                  , color = 'b'
                  , linewidth = 0
                  , align='center'
                  )
  """
    subbars = []
    for i, thresh in enumerate(thresholds):
        subbars.append(
            p.barh(ind, [
                float(docs_under_thresh(pr, thresh)) / dumpSize(pr)
                for pr in prefixes
            ],
                   height,
                   color=colors[i % len(colors)],
                   linewidth=0,
                   align='center'))

    p.ylim(-height, len(prefixes) * blockSize)
    p.xlim(0, 1)
    yfontprop = FontProperties(size=4)
    xfontprop = FontProperties(size=4)
    p.xlabel('Proportion')
    p.ylabel('Language Code')
    p.title('Proportion of Documents Under Threshold')
    p.yticks(ind, prefixes, fontproperties=yfontprop)
    xmin, xmax = p.xlim()
    xtick_interval = 0.1
    p.xticks(p.arange(xmin, xmax, xtick_interval), fontproperties=xfontprop)
    p.gca().xaxis.grid(linestyle='-', linewidth=0.15)
    p.gca().yaxis.tick_left()
    p.savefig(path, dpi=300)
    p.close()
    p.clf()
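All of the examples on this page are shown without their import preamble. A minimal sketch of what the charting snippets appear to rely on; the matplotlib imports are standard, while dumpSize, docs_under_thresh, html_colors and rounded_interval are wikidump's own helpers, so the import shown for them is only a guess:

import operator

import matplotlib.pylab as p                        # the `p` used throughout
from matplotlib.font_manager import FontProperties

# Project-specific helpers; this import path is illustrative, not taken from
# the original analysis.py.
# from wikidump.utils import dumpSize, docs_under_thresh, html_colors, rounded_interval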
Example #2
File: analysis.py Project: adamar/wikidump
def wiki_proportions_chart(path, prefixes):
  prefixes, sizes = zip(*sorted( [(pr, dumpSize(pr)) for pr in prefixes]
                               , key = operator.itemgetter(1)
                               )
                       )

  blockSize = 5 
  ind = p.arange(0, blockSize*len(prefixes), blockSize) # y location for groups
  height = 4 # bar height 

  #colors = ['g','r','c','m','y']
  thresholds = [5000, 2000,1000,500,200,100,50,20,10]
  colors = [str(float(i+1) / (len(thresholds)+1)) for i in xrange(len(thresholds))]
  colors.reverse()

  p.clf()
  """
  overall = p.barh( ind 
                  , [1.0] * len(ind) 
                  , height
                  , color = 'b'
                  , linewidth = 0
                  , align='center'
                  )
  """
  subbars = []
  for i, thresh in enumerate(thresholds) :
    subbars.append( p.barh( ind
                          , [ float(docs_under_thresh(pr, thresh)) / dumpSize(pr) for pr in prefixes]
                          , height
                          , color = colors[ i % len(colors) ] 
                          , linewidth = 0
                          , align='center'
                          )
                  )
  
  p.ylim(-height, len(prefixes) * blockSize)
  p.xlim(0, 1)
  yfontprop = FontProperties(size=4)
  xfontprop = FontProperties(size=4)
  p.xlabel('Proportion')
  p.ylabel('Language Code')
  p.title('Proportion of Documents Under Threshold')
  p.yticks(ind, prefixes, fontproperties = yfontprop)
  xmin, xmax = p.xlim()
  xtick_interval         = 0.1 
  p.xticks( p.arange(xmin,xmax,xtick_interval),fontproperties = xfontprop)
  p.gca().xaxis.grid(linestyle = '-', linewidth=0.15)
  p.gca().yaxis.tick_left()
  p.savefig(path, dpi=300)
  p.close()
  p.clf()
Example #3
File: analysis.py Project: adamar/wikidump
def wiki_sizes_chart(path, prefixes, upperlimit = None ):
  prefixes, sizes = zip(*sorted( [(pr, dumpSize(pr)) for pr in prefixes]
                               , key = operator.itemgetter(1)
                               )
                       )

  blockSize = 5 
  ind = p.arange(0, blockSize*len(prefixes), blockSize) # y location for groups
  height = 4 # bar height 

  #colors = ['g','r','c','m','y']
  colors = html_colors

  thresholds = [5000, 2000,1000,500,200,100,50,20,10]
  #colors = [str(float(i+1) / (len(thresholds)+1)) for i in xrange(len(thresholds))]
  #colors.reverse()

  overall = p.barh( ind 
                  , sizes
                  , height
                  , color = 'b'
                  , linewidth = 0
                  , align='center'
                  )
  subbars = []
  for i, thresh in enumerate(thresholds) :
    subbars.append( p.barh( ind
                          , [ docs_under_thresh(pr, thresh) for pr in prefixes]
                          , height
                          , color = colors[ i % len(colors) ] 
                          , linewidth = 0
                          , align='center'
                          )
                  )
  
  p.ylim(-height, len(prefixes) * blockSize)
  if upperlimit:
    p.xlim(0, upperlimit)
  yfontprop = FontProperties(size=4)
  xfontprop = FontProperties(size=4)
  p.xlabel('Documents')
  p.ylabel('Language Code')
  p.title('Number of Documents Under Threshold')
  p.yticks(ind, prefixes, fontproperties = yfontprop)
  xmin, xmax = p.xlim()
  xtick_interval         = rounded_interval(xmin, xmax, 20, 2) 
  p.xticks( p.arange(xmin,xmax,xtick_interval),fontproperties = xfontprop)
  p.gca().xaxis.grid(linestyle = '-', linewidth=0.15)
  p.gca().yaxis.tick_left()
  p.legend( [ b[0] for b in subbars]
          , map(str,thresholds)
          , prop = xfontprop
          , loc = 'lower right' 
          )


  p.savefig(path, dpi=300)
  p.close()
  p.clf()
Example #4
def wiki_sizes_chart(path, prefixes, upperlimit=None):
    prefixes, sizes = zip(*sorted([(pr, dumpSize(pr)) for pr in prefixes],
                                  key=operator.itemgetter(1)))

    blockSize = 5
    ind = p.arange(0, blockSize * len(prefixes),
                   blockSize)  # y location for groups
    height = 4  # bar height

    #colors = ['g','r','c','m','y']
    colors = html_colors

    thresholds = [5000, 2000, 1000, 500, 200, 100, 50, 20, 10]
    #colors = [str(float(i+1) / (len(thresholds)+1)) for i in xrange(len(thresholds))]
    #colors.reverse()

    # Background bars: the total number of documents in each language's dump.
    overall = p.barh(ind,
                     sizes,
                     height,
                     color='b',
                     linewidth=0,
                     align='center')
    subbars = []
    for i, thresh in enumerate(thresholds):
        subbars.append(
            p.barh(ind, [docs_under_thresh(pr, thresh) for pr in prefixes],
                   height,
                   color=colors[i % len(colors)],
                   linewidth=0,
                   align='center'))

    p.ylim(-height, len(prefixes) * blockSize)
    if upperlimit:
        p.xlim(0, upperlimit)
    yfontprop = FontProperties(size=4)
    xfontprop = FontProperties(size=4)
    p.xlabel('Documents')
    p.ylabel('Language Code')
    p.title('Number of Documents Under Threshold')
    p.yticks(ind, prefixes, fontproperties=yfontprop)
    xmin, xmax = p.xlim()
    xtick_interval = rounded_interval(xmin, xmax, 20, 2)
    p.xticks(p.arange(xmin, xmax, xtick_interval), fontproperties=xfontprop)
    p.gca().xaxis.grid(linestyle='-', linewidth=0.15)
    p.gca().yaxis.tick_left()
    p.legend([b[0] for b in subbars],
             map(str, thresholds),
             prop=xfontprop,
             loc='lower right')

    p.savefig(path, dpi=300)
    p.close()
    p.clf()
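A short usage sketch for the two chart functions above, assuming the helpers from the import sketch earlier are available; the prefix list and output paths are made up purely for illustration, and upperlimit only caps the x axis of the sizes chart:

# Hypothetical driver; language prefixes and filenames are examples only.
prefixes = ['en', 'de', 'fr', 'ja', 'zh']
wiki_proportions_chart('proportions.png', prefixes)
wiki_sizes_chart('sizes.png', prefixes, upperlimit=200000)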
Example #5
def create_dataset(num_doc,
                   distribution,
                   num_seg,
                   text_transformer,
                   exclude=[],
                   seed=None):
    """
  Create a multilingual dataset from wikipedia data based on segments
  of monolingual documents.

  @param num_doc: number of documents in intended dataset
  @param distribution: Mapping from class name to proportion
  @param segments: Sequence of segments and their relative sizes
  @param exclude: Docids to be excluded from the final dataset
  """
    logger = logging.getLogger('wikidump.segment.create_dataset')
    logger.info('Creating dataset')
    docmap = {}
    classmap = {}
    # Set up a parser for each class
    parser = {}
    classlabels = []
    class_probs = numpy.empty(len(distribution), dtype=float)
    dump_indices = []
    denominator = 0
    logger.info('Setting up parsers')
    for i, classlabel in enumerate(distribution):
        parser[classlabel] = page_parser(classlabel)
        classlabels.append(classlabel)
        denominator += distribution[classlabel]
        class_probs[i] = denominator
        dumpsize = dumpSize(classlabel)
        indices = range(dumpsize)
        random.shuffle(indices)
        dump_indices.append(indices)
        logger.debug('Done for %s, %d indices', classlabel, dumpsize)
    logger.info('Parsers ready')

    # Normalize to a CDF
    class_probs /= denominator

    random.seed(seed)

    logger.info('Obtaining documents')
    for doc_num in range(num_doc):
        doc = ""
        classes = set()
        segids = []
        for seg_num in range(num_seg):
            seg_class_index = numpy.searchsorted(class_probs, random.random())
            seg_class = classlabels[seg_class_index]
            classes.add(seg_class)
            doc_index = dump_indices[seg_class_index].pop(0)
            segid = "_".join((seg_class, str(doc_index)))
            while segid in exclude:
                logger.debug('Ignored %s: in exclude', segid)
                try:
                    doc_index = dump_indices[seg_class_index].pop(0)
                except IndexError:
                    raise ValueError("No more documents in class %s available" % seg_class)
                segid = "_".join((seg_class, str(doc_index)))
            segids.append(segid)
            content = text_transformer(parser[seg_class].get_entry(doc_index))
            seg_size = len(content) / num_seg
            seg_start = seg_size * seg_num
            seg_end = seg_start + seg_size
            doc += content[seg_start:seg_end]
        docid = '-'.join(segids)
        docmap[docid] = doc
        classmap[docid] = list(classes)
        logger.debug('Index: %d ID: %s', doc_num, docid)
    return docmap, classmap
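create_dataset chooses the language of each segment by turning the per-class weights in distribution into a cumulative distribution and binary-searching it with numpy.searchsorted, much as the class_probs code above does. A self-contained sketch of just that sampling step, with toy weights standing in for the real dump sizes:

import random
import numpy

weights = {'en': 5, 'de': 3, 'fr': 2}          # toy stand-in for `distribution`
classlabels = list(weights)
class_probs = numpy.cumsum([weights[c] for c in classlabels], dtype=float)
class_probs /= class_probs[-1]                 # normalize running totals to a CDF

# searchsorted returns the first index whose CDF value is >= the uniform draw,
# so each class is picked with probability proportional to its weight.
picks = [classlabels[numpy.searchsorted(class_probs, random.random())]
         for _ in range(10)]
print(picks)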
Example #6
                    raise ValueError("No more documents in class %s available" % seg_class)
                segid = "_".join((seg_class, str(doc_index)))
            segids.append(segid)
            content = text_transformer(parser[seg_class].get_entry(doc_index))
            seg_size = len(content) / num_seg
            seg_start = seg_size * seg_num
            seg_end = seg_start + seg_size
            doc += content[seg_start:seg_end]
        docid = '-'.join(segids)
        docmap[docid] = doc
        classmap[docid] = list(classes)
        logger.debug('Index: %d ID: %s', doc_num, docid)
    return docmap, classmap


if __name__ == "__main__":
    picklefile = open('segment.pickle', "w")
    distribution = {}
    for p in all_prefixes:
        if len(p) > 3:
            logger.warning('Ignoring %s', p)
        else:
            distribution[p] = dumpSize(p)
    data = create_dataset(5000,
                          distribution,
                          2,
                          strip_mediawiki_markup,
                          exclude=[],
                          seed=61383441363)
    dump(data, picklefile)
Example #7
File: segment.py Project: saffsd/wikidump
def create_dataset(num_doc, distribution, num_seg, text_transformer, exclude=[], seed=None):
    """
  Create a multilingual dataset from wikipedia data based on segments
  of monolingual documents.

  @param num_doc: number of documents in intended dataset
  @param distribution: Mapping from class name to proportion
  @param segments: Sequence of segments and their relative sizes
  @param exclude: Docids to be excluded from the final dataset
  """
    logger = logging.getLogger("wikidump.segment.create_dataset")
    logger.info("Creating dataset")
    docmap = {}
    classmap = {}
    # Set up a parser for each class
    parser = {}
    classlabels = []
    class_probs = numpy.empty(len(distribution), dtype=float)
    dump_indices = []
    denominator = 0
    logger.info("Setting up parsers")
    for i, classlabel in enumerate(distribution):
        parser[classlabel] = page_parser(classlabel)
        classlabels.append(classlabel)
        denominator += distribution[classlabel]
        class_probs[i] = denominator
        dumpsize = dumpSize(classlabel)
        indices = range(dumpsize)
        random.shuffle(indices)
        dump_indices.append(indices)
        logger.debug("Done for %s, %d indices", classlabel, dumpsize)
    logger.info("Parsers ready")

    # Normalize to a CDF
    class_probs /= denominator

    random.seed(seed)

    logger.info("Obtaining documents")
    for doc_num in range(num_doc):
        doc = ""
        classes = set()
        segids = []
        for seg_num in range(num_seg):
            seg_class_index = numpy.searchsorted(class_probs, random.random())
            seg_class = classlabels[seg_class_index]
            classes.add(seg_class)
            doc_index = dump_indices[seg_class_index].pop(0)
            segid = "_".join((seg_class, str(doc_index)))
            while segid in exclude:
                logger.debug("Ignored %s: in exclude", segid)
                try:
                    doc_index = dump_indices[seg_class_index].pop(0)
                except IndexError:
                    raise ValueError("No more documents in class %s available" % seg_class)
                segid = "_".join((seg_class, str(doc_index)))
            segids.append(segid)
            content = text_transformer(parser[seg_class].get_entry(doc_index))
            seg_size = len(content) / num_seg
            seg_start = seg_size * seg_num
            seg_end = seg_start + seg_size
            doc += content[seg_start:seg_end]
        docid = "-".join(segids)
        docmap[docid] = doc
        classmap[docid] = list(classes)
        logger.debug("Index: %d ID: %s", doc_num, docid)
    return docmap, classmap
Example #8
File: segment.py Project: saffsd/wikidump
            while segid in exclude:
                logger.debug("Ignored %s: in exclude", segid)
                try:
                    doc_index = dump_indices[seg_class_index].pop(0)
                except IndexError:
                    raise ValueError("No more documents in class %s available" % seg_class)
                segid = "_".join((seg_class, str(doc_index)))
            segids.append(segid)
            content = text_transformer(parser[seg_class].get_entry(doc_index))
            seg_size = len(content) / num_seg
            seg_start = seg_size * seg_num
            seg_end = seg_start + seg_size
            doc += content[seg_start:seg_end]
        docid = "-".join(segids)
        docmap[docid] = doc
        classmap[docid] = list(classes)
        logger.debug("Index: %d ID: %s", doc_num, docid)
    return docmap, classmap


if __name__ == "__main__":
    picklefile = open("segment.pickle", "w")
    distribution = {}
    for p in all_prefixes:
        if len(p) > 3:
            logger.warning("Ignoring %s", p)
        else:
            distribution[p] = dumpSize(p)
    data = create_dataset(5000, distribution, 2, strip_mediawiki_markup, exclude=[], seed=61383441363)
    dump(data, picklefile)
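Note that these snippets are Python 2 code throughout (xrange, shuffling the list returned by range, integer division when slicing segments, pickling to a text-mode file). As a rough sketch, the driver block might look like this under Python 3, assuming dump is pickle.dump and that create_dataset itself has also been ported (list(range(...)) for the shuffled indices, // for the segment size):

import pickle

if __name__ == "__main__":
    distribution = {}
    for prefix in all_prefixes:          # all_prefixes: language prefixes known to wikidump
        if len(prefix) > 3:
            logger.warning("Ignoring %s", prefix)
        else:
            distribution[prefix] = dumpSize(prefix)
    data = create_dataset(5000, distribution, 2, strip_mediawiki_markup,
                          exclude=[], seed=61383441363)
    with open("segment.pickle", "wb") as picklefile:   # pickle needs binary mode on Python 3
        pickle.dump(data, picklefile)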