Example #1
def process_semistructured(corpus_dir, outfile, language, processes,
                           sourced_only, genealogics, dump_unresolved):
    """ Processes the corpus and extracts semi-structured data serialized into QuickStatements.
        Needs a second pass on genealogics to correctly resolve family members.
    """

    resolver = SemistructuredSerializer(language, sourced_only)

    genealogics_url_to_id, count, skipped = resolver.process_corpus(
        io.load_scraped_items(corpus_dir), outfile, dump_unresolved, genealogics, processes
    )

    logger.info('Done, produced %d statements, skipped %d names', count, skipped)
    if not genealogics:
        logger.info("Dataset serialized to '%s'" % outfile.name)
        return

    logger.info('Starting second pass on genealogics ...')
    genealogics_data = resolver.resolve_genealogics_family(genealogics, genealogics_url_to_id)
    for success, item in genealogics_data:
        if success:
            outfile.write(item.encode('utf8'))
            outfile.write('\n')

            count += 1
            if count % 10000 == 0:
                logger.info('Produced %d statements so far, skipped %d names', count, skipped)
        else:
            skipped += 1
            if dump_unresolved:
                dump_unresolved.write(json.dumps(item))
                dump_unresolved.write('\n')

    logger.info('Done, produced %d statements, skipped %d names', count, skipped)
    logger.info("Dataset serialized to '%s'" % outfile.name)
Example #2
def about_biographies_count(corpus):
    """ Finds how many items have/don't have a biography
    """
    count = with_bio = characters = 0
    for doc in load_scraped_items(corpus):
        count += 1
        if doc.get('bio') and len(doc['bio']) > 5:
            with_bio += 1
            characters += len(doc['bio'])

    print 'Total number of items:', count
    print 'Items with a biography %d (%.2f %%)' % (with_bio,
                                                   100. * with_bio / count)
    print 'Cumulative length of biographies: %d characters' % characters

    try:
        import matplotlib.pyplot as plt
    except ImportError:
        logger.warn('Cannot import matplotlib, skipping chart')
        return

    plt.bar([0, 1], [count - with_bio, with_bio], width=0.75)
    plt.xticks([0.375, 1.375], ['Without Biography', 'With Biography'])
    plt.grid(True, axis='y')
    plt.xlim((-0.5, 2.25))
    plt.show()
Example #3
def about_biographies_length(corpus, bins, log_y):
    """ Computes an histogram of biography length
    """
    lengths = []
    for doc in load_scraped_items(corpus):
        if len(doc.get('bio') or '') > 5:
            lengths.append(len(doc['bio']))

    width = float(max(lengths)) / bins
    buckets = defaultdict(int)
    for each in lengths:
        buckets[int(each / width)] += 1

    for i in xrange(max(buckets.keys()) + 1):
        print '%d - %d: %d' % (i * width, (i + 1) * width - 1, buckets[i])

    try:
        import matplotlib.pyplot as plt
    except ImportError:
        logger.warn('Cannot import matplotlib, skipping chart')
        return

    plt.title('Biography length distribution for %d items' % len(lengths))
    plt.xlabel('Biography length in characters')
    plt.ylabel('Number of items')
    plt.hist(lengths, bins=bins, log=log_y)
    plt.grid(True)
    plt.tight_layout()
    plt.show()
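
The bucketing in about_biographies_length assigns each biography to a fixed-width bin of max(lengths) / bins characters. A self-contained check of the same arithmetic with made-up lengths (the numbers are purely illustrative):

from collections import defaultdict

lengths = [120, 250, 400, 410, 980]  # made-up biography lengths
bins = 10
width = float(max(lengths)) / bins   # 98.0 characters per bucket

buckets = defaultdict(int)
for each in lengths:
    buckets[int(each / width)] += 1

# e.g. 250 / 98.0 -> bucket 2, which covers the range 196-293
print(sorted(buckets.items()))       # [(1, 1), (2, 1), (4, 2), (10, 1)]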
Example #4
def main(corpus, document_key, pos_tag_key, language_code, tagger, outfile,
         tt_home, batch_size):
    """ Perform part-of-speech (POS) tagging over an input corpus.
    """
    if tagger == 'tt':
        pos_tagger = TTPosTagger(language_code, tt_home)
        logger.info(
            "About to perform part-of-speech tagging with TreeTagger ...")
    else:
        pos_tagger = NLTKPosTagger(language_code)
        logger.info(
            "About to perform part-of-speech tagging with NLTK tagger ...")

    corpus = load_scraped_items(corpus)

    total = 0
    for i, tagged_document in enumerate(
            pos_tagger.tag_many(corpus, document_key, pos_tag_key,
                                batch_size)):
        total += 1
        outfile.write(json.dumps(tagged_document) + '\n')
        if (i + 1) % 10000 == 0:
            logger.info('processed %d items', i + 1)

    logger.info("Done, total tagged items: %d" % total)

    return 0
Example #5
def main(corpus, lemma_to_tokens, language_code, strategy, outfile, processes,
         sentences_key, document_key, match_base_form):
    """ Extract corpus sentences containing at least one token in the given set. """
    corpus = load_scraped_items(corpus)
    updated = extract_sentences(corpus, sentences_key, document_key, language_code,
                                json.load(lemma_to_tokens), strategy, match_base_form, processes)

    for item in updated:
        outfile.write(json.dumps(item) + '\n')
    logger.info("Dumped sentences to '%s'" % outfile.name)
    
    return 0
Example #6
def about_sources(corpus, processes, with_bio):
    """ Items' sources
    """
    def worker(items):
        sources = defaultdict(int)
        for doc in items:
            url = doc.get('url')
            if not url:
                logger.warn('found an item without URL, name: %s, bio: %s',
                            doc.get('name'),
                            (doc.get('bio') or '')[:100] + ' ...')
                sources['_skipped_'] += 1
                continue
            elif with_bio and len(doc.get('bio') or '') < 5:
                continue

            parsed = urlparse(url)
            if parsed.netloc:
                sources[parsed.netloc] += 1
            else:
                logger.warn('cannot parse URL: %s', url)
                sources['_skipped_'] += 1
        return sources

    aggregated_sources = defaultdict(int)
    corpus = parallel.make_batches(load_scraped_items(corpus), 1000)
    for sources in parallel.map(worker, corpus, processes):
        for k, v in sources.iteritems():
            aggregated_sources[k] += v

    aggregated_sources = sorted(aggregated_sources.items(),
                                key=lambda (_, v): v,
                                reverse=True)
    for source, count in aggregated_sources:
        print source, count

    try:
        import matplotlib.pyplot as plt
    except ImportError:
        logger.warn('Cannot import matplotlib, skipping chart')
        return

    count = sum(c for s, c in aggregated_sources)
    display_sources = filter(lambda (s, v): float(v) / count >= 0.01,
                             aggregated_sources)
    sources, values = map(list, zip(*display_sources))
    sources.append('Rest')
    values.append(count - sum(values))
    plt.pie(values, labels=sources)
    plt.axis('equal')
    plt.show()
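
The worker above keys its counts on urlparse(url).netloc, so items are grouped by web domain. A tiny standalone check of that grouping, with invented URLs, written so it runs under both Python 2 and 3:

try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse      # Python 2

from collections import defaultdict

urls = ['http://example.org/people/1',
        'http://example.org/people/2',
        'https://other.example.com/bio']  # invented URLs

sources = defaultdict(int)
for url in urls:
    sources[urlparse(url).netloc] += 1

print(sorted(sources.items()))  # [('example.org', 2), ('other.example.com', 1)]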
Example #7
def main(corpus, lemma_to_tokens, language_code, strategy, outfile, processes,
         sentences_key, document_key, match_base_form):
    """ Extract corpus sentences containing at least one token in the given set. """
    corpus = load_scraped_items(corpus)
    updated = extract_sentences(corpus, sentences_key,
                                document_key, language_code,
                                json.load(lemma_to_tokens), strategy,
                                match_base_form, processes)

    for item in updated:
        outfile.write(json.dumps(item) + '\n')
    logger.info("Dumped sentences to '%s'" % outfile.name)

    return 0
Example #8
def produce_lemma_tokens(pos_tagged_path, pos_tag_key, language):
    """ Extracts a map from lemma to all its tokens

        :param str pos_tagged_path: path of the pos-tagged corpus
        :param str pos_tag_key: where the pos tag data is in each item
        :param language: language of the corpus
        :return: mapping from lemma to tokens
        :rtype: dict
    """
    corpus = load_scraped_items(pos_tagged_path)
    lemma_tokens = defaultdict(set)

    for item in corpus:
        for token, pos, lemma in item.get(pos_tag_key, []):
            if pos.startswith(VERBAL_PREFIXES[language]):
                lemma_tokens[lemma.lower()].add(token.lower())

    return lemma_tokens
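
The mapping returned above groups verbal surface forms under their lemma. Below is a minimal illustration of the same grouping on an in-memory item, where the invented token triples and the plain 'V' prefix stand in for real pos-tagged data and VERBAL_PREFIXES[language], followed by the set-to-list conversion that a later json.load (as in Examples #5 and #7) would require:

import json
from collections import defaultdict

# invented pos-tagged item: (token, pos, lemma) triples
item = {'pos_tags': [('was', 'VBD', 'be'),
                     ('is', 'VBZ', 'be'),
                     ('born', 'VBN', 'bear'),
                     ('London', 'NNP', 'London')]}

lemma_tokens = defaultdict(set)
for token, pos, lemma in item.get('pos_tags', []):
    if pos.startswith('V'):  # stand-in for VERBAL_PREFIXES[language]
        lemma_tokens[lemma.lower()].add(token.lower())

print(dict(lemma_tokens))  # e.g. {'be': set(['is', 'was']), 'bear': set(['born'])}

# sets are not JSON-serializable, so dumping the mapping for a later json.load
# requires converting each set of tokens to a list first:
serializable = {lemma: sorted(tokens) for lemma, tokens in lemma_tokens.items()}
print(json.dumps(serializable, sort_keys=True))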
Example #9
def preprocess_corpus(corpus_dir, document_key, output_dir, items_per_file,
                      min_length):
    """ Remove items without text documents or whose text document is too short """
    filename = 'corpus-%d.jsonlines'
    count = 0
    current_file = open(os.path.join(output_dir, filename % 0), 'w')

    for item in load_scraped_items(corpus_dir):
        if item.get(document_key) and len(item[document_key]) > min_length:
            count += 1
            if count % items_per_file == 0:
                fname = filename % (count / items_per_file)
                logger.info('processed %d items so far, continuing in %s' %
                            (count, fname))
                current_file.close()
                current_file = open(os.path.join(output_dir, fname), 'w')
            item['id'] = hashlib.sha1(
                item[document_key].encode('utf8')).hexdigest()
            json.dump(item, current_file)
            current_file.write('\n')

    current_file.close()
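
The rotation above opens a new output file whenever the running count of kept items reaches a multiple of items_per_file, so the first file ends up one item shorter than the following ones. A quick check of the file-switching arithmetic with illustrative values:

items_per_file = 3
filename = 'corpus-%d.jsonlines'

for count in range(1, 8):             # counts of the kept items
    if count % items_per_file == 0:   # same rotation test as above
        print('item %d opens %s' % (count, filename % (count // items_per_file)))
# item 3 opens corpus-1.jsonlines
# item 6 opens corpus-2.jsonlines
# so items 1-2 land in corpus-0, 3-5 in corpus-1, 6-7 in corpus-2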
Example #10
def main(corpus, document_key, pos_tag_key, language_code, tagger, outfile, tt_home, batch_size):
    """ Perform part-of-speech (POS) tagging over an input corpus.
    """
    if tagger == 'tt':
        pos_tagger = TTPosTagger(language_code, tt_home)
        logger.info("About to perform part-of-speech tagging with TreeTagger ...")
    else:
        pos_tagger = NLTKPosTagger(language_code)
        logger.info("About to perform part-of-speech tagging with NLTK tagger ...")

    corpus = load_scraped_items(corpus)
    
    total = 0
    for i, tagged_document in enumerate(pos_tagger.tag_many(corpus, document_key, pos_tag_key, batch_size)):
        total += 1
        outfile.write(json.dumps(tagged_document) + '\n')
        if (i + 1) % 10000 == 0:
            logger.info('processed %d items', i + 1)
    
    logger.info("Done, total tagged items: %d" % total)
    
    return 0
Example #11
    def __init__(self, corpus_path, pos_tag_key):
        self.tags = self._flatten(
            item.get(pos_tag_key) for item in load_scraped_items(corpus_path))
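
The constructor above relies on a _flatten helper that is not shown in this listing. A plausible sketch, under the assumption that it simply chains the per-item tag lists together and drops items whose pos_tag_key is missing:

from itertools import chain


def _flatten(iterables):
    """ Hypothetical helper: concatenate the per-item tag lists into a single
        list, skipping entries that are None or empty.
    """
    return list(chain.from_iterable(tags for tags in iterables if tags))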