Example #1
def extract_multilingual_documents(inv_dict, langs, text_path, out_path):
    if not os.path.exists(out_path):
        os.makedirs(out_path)
    for lang in langs:
        if lang not in inv_dict:
            raise ValueError("Lang %s is not in the dictionary" % lang)

    docs_created = len(list_files(out_path))
    print("%d multilingual documents found." % docs_created)
    for doc, lang in _doc_generator(text_path, langs):
        title = _extract_title(doc)

        if title in inv_dict[lang]:
            ids = inv_dict[lang][title]
            for id in ids:
                target_file = join(out_path, id) + ".xml"
                if os.path.exists(target_file):
                    _append_doc(target_file, doc, lang)
                else:
                    _create_doc(target_file, id, doc, lang)
                    docs_created += 1
        else:
            if not re.match('[A-Za-z]+', title):
                print("Title <%s> for lang <%s> not in dictionary" %
                      (title, lang))
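
A minimal usage sketch for the function above. The module name `wikipedia_tools`, the paths, and the toy `inv_dict` entries are assumptions for illustration; `inv_dict` is expected to map each language to a dictionary from page titles to document ids.

# Hypothetical usage; module name, paths and ids are placeholders, not part of the original code.
from wikipedia_tools import extract_multilingual_documents

# inv_dict: {lang: {title: [doc_id, ...]}}, e.g. built from Wikipedia inter-language links
inv_dict = {
    'en': {'Rome': ['doc_001']},
    'it': {'Roma': ['doc_001']},
}

extract_multilingual_documents(
    inv_dict,
    langs=['en', 'it'],
    text_path='/path/to/extracted/wikipedia',  # one sub-directory per language
    out_path='/path/to/multilingual/xml'       # one <id>.xml file per aligned document
)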
Example #2
def _doc_generator(text_path, langs):
    dotspace = re.compile(r'\.(?!\s)')
    for l, lang in enumerate(langs):
        print("Processing language <%s> (%d/%d)" % (lang, l, len(langs)))
        lang_dir = join(text_path, lang)
        split_dirs = list_dirs(lang_dir)
        for sd, split_dir in enumerate(split_dirs):
            print("\tprocessing split_dir <%s> (%d/%d)" %
                  (split_dir, sd, len(split_dirs)))
            split_files = list_files(join(lang_dir, split_dir))
            for sf, split_file in enumerate(split_files):
                print("\t\tprocessing split_file <%s> (%d/%d)" %
                      (split_file, sf, len(split_files)))
                # note: BZ2File's 'buffering' argument is ignored in Python 3 and was removed in 3.9
                with BZ2File(join(lang_dir, split_dir, split_file), 'r') as fi:
                    while True:
                        # BZ2File yields bytes; decode so the regex and escape() below work on str
                        doc_lines = [line.decode('utf-8') for line in islice(fi, 3)]
                        if doc_lines:
                            # some sentences are not followed by a space after the dot
                            doc_lines[1] = dotspace.sub('. ', doc_lines[1])
                            # workaround: the &nbsp; HTML entity sometimes survives untreated;
                            # replace it with a plain space and escape the line again
                            doc_lines[1] = escape(doc_lines[1].replace("&nbsp;", " "))
                            yield doc_lines, lang
                        else:
                            break
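
The generator above is what feeds `extract_multilingual_documents` in Example #1. As a standalone sketch (the module name and directory layout are assumptions), it can also be iterated directly:

# Hypothetical direct use; expects text_path/<lang>/<split_dir>/<split_file> bz2-compressed dumps.
from wikipedia_tools import _doc_generator

for doc_lines, lang in _doc_generator('/path/to/extracted/wikipedia', ['en', 'it']):
    # doc_lines holds up to three lines per document; the second one carries the text
    print(lang, doc_lines[0].strip()[:80])
    break  # peek at the first document only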
Example #3
def fetch_wikipedia_multilingual(wiki_multi_path, langs, min_words=100, deletions=False, max_documents=-1, pickle_name=None):
    if pickle_name and os.path.exists(pickle_name):
        print("unpickling %s" % pickle_name)
        return pickle.load(open(pickle_name, 'rb'))

    multi_docs = list_files(wiki_multi_path)
    mling_documents = {l:[] for l in langs}
    valid_documents = 0
    minwords_exception = 0
    wrongdoc_exception = 0
    for d, multi_doc in enumerate(multi_docs):
        print("\rProcessed %d/%d documents, valid %d/%d, few_words=%d, few_langs=%d" %
              (d, len(multi_docs), valid_documents, len(multi_docs), minwords_exception, wrongdoc_exception), end="")
        doc_path = join(wiki_multi_path, multi_doc)
        try:
            m_doc = _load_multilang_doc(doc_path, langs, min_words)
            valid_documents += 1
            for l in langs:
                mling_documents[l].append(m_doc[l])
        except MinWordsNotReached:
            minwords_exception += 1
            if deletions: os.remove(doc_path)
        except WrongDocumentFormat:
            wrongdoc_exception += 1
            if deletions: os.remove(doc_path)
        if max_documents>0 and valid_documents>=max_documents:
            break

    if pickle_name:
        print("Pickling wikipedia documents object in %s" % pickle_name)
        pickle.dump(mling_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL)

    return mling_documents
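
A usage sketch for the loader above, assuming the same hypothetical module; `wiki_multi_path` points to the per-document XML files produced in Example #1, and the returned dictionary keeps the i-th document aligned across languages.

# Hypothetical usage; module name and paths are placeholders.
from wikipedia_tools import fetch_wikipedia_multilingual

langs = ['en', 'it', 'de']
docs = fetch_wikipedia_multilingual(
    '/path/to/multilingual/xml', langs,
    min_words=100,                    # skip documents shorter than this in any language
    max_documents=10000,              # stop after this many valid documents (-1 = no limit)
    pickle_name='wiki_multi.pickle'   # cache the result for later runs
)
print({l: len(docs[l]) for l in langs})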
Example #4
def fetch_RCV1(data_path, split='all'):

    assert split in ['train', 'test',
                     'all'], 'split should be "train", "test", or "all"'

    request = []
    labels = set()
    read_documents = 0
    lang = 'en'

    training_documents = 23149
    test_documents = 781265

    if split == 'all':
        split_range = (2286, 810596)
        expected = training_documents + test_documents
    elif split == 'train':
        split_range = (2286, 26150)
        expected = training_documents
    else:
        split_range = (26151, 810596)
        expected = test_documents

    global nwords
    nwords = []
    for part in list_files(data_path):
        if not re.match(r'\d+\.zip', part): continue
        target_file = join(data_path, part)
        assert exists(target_file), \
            "You don't seem to have the file " + part + " in " + data_path + ", and the RCV1 corpus cannot be" + \
            " downloaded without formal permission. Please refer to " + RCV1_BASE_URL + " for more information."
        zipfile = ZipFile(target_file)
        for xmlfile in zipfile.namelist():
            xmlcontent = zipfile.open(xmlfile).read()
            try:
                doc = parse_document(xmlcontent,
                                     assert_lang=lang,
                                     valid_id_range=split_range)
                labels.update(doc.categories)
                request.append(doc)
                read_documents += 1
            except ValueError:
                print(
                    '\n\tskipping document {} with inconsistent language label: expected language {}'
                    .format(part + '/' + xmlfile, lang))
            except (IDRangeException, ExpectedLanguageException) as e:
                pass
            print('\r[{}] read {} documents'.format(part, len(request)),
                  end='')
            if read_documents == expected: break
        if read_documents == expected: break
    print()
    print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords),
                                               np.min(nwords), np.max(nwords)))
    return request, list(labels)
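
A usage sketch, assuming the RCV1 zip parts have already been obtained (the corpus requires formal permission, see RCV1_BASE_URL in the code) and that the function lives in a hypothetical `rcv_reader` module:

# Hypothetical usage; module name and data_path are placeholders.
from rcv_reader import fetch_RCV1

train_docs, train_labels = fetch_RCV1('/path/to/rcv1/zips', split='train')
test_docs, _ = fetch_RCV1('/path/to/rcv1/zips', split='test')

print(len(train_docs), 'training documents,', len(test_docs), 'test documents')
print(len(train_labels), 'distinct categories in the training split')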
Example #5
def fetch_RCV2(data_path, languages=None):

    if not languages:
        languages = list(RCV2_LANG_DIR.keys())
    else:
        assert set(languages).issubset(set(
            RCV2_LANG_DIR.keys())), 'languages not in scope'

    request = []
    labels = set()
    global nwords
    nwords = []
    for lang in languages:
        path = join(data_path, RCV2_LANG_DIR[lang])
        lang_docs_read = 0
        for part in list_files(path):
            target_file = join(path, part)
            assert exists(target_file), \
                "You don't seem to have the file " + part + " in " + path + ", and the RCV2 corpus cannot be" + \
                " downloaded without formal permission. Please refer to " + RCV2_BASE_URL + " for more information."
            zipfile = ZipFile(target_file)
            for xmlfile in zipfile.namelist():
                xmlcontent = zipfile.open(xmlfile).read()
                try:
                    doc = parse_document(xmlcontent, assert_lang=lang)
                    labels.update(doc.categories)
                    request.append(doc)
                    lang_docs_read += 1
                except ValueError:
                    print(
                        '\n\tskipping document {} with inconsistent language label: expected language {}'
                        .format(
                            RCV2_LANG_DIR[lang] + '/' + part + '/' + xmlfile,
                            lang))
                except (IDRangeException, ExpectedLanguageException) as e:
                    pass
                print('\r[{}] read {} documents, {} for language {}'.format(
                    RCV2_LANG_DIR[lang] + '/' + part, len(request),
                    lang_docs_read, lang),
                      end='')
        print()
    print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords),
                                               np.min(nwords), np.max(nwords)))
    return request, list(labels)
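
A similar sketch for the RCV2 loader; the module name, data path, and language codes are assumptions (the codes must be keys of RCV2_LANG_DIR):

# Hypothetical usage; module name, path and language codes are placeholders.
from rcv_reader import fetch_RCV2

docs, labels = fetch_RCV2('/path/to/rcv2', languages=['it', 'fr'])
print(len(docs), 'documents,', len(labels), 'distinct categories')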
Example #6
def fetch_jrcacquis(
        langs=None,
        data_path=None,
        years=None,
        ignore_unclassified=True,
        cat_filter=None,
        cat_threshold=0,
        parallel=None,
        most_frequent=-1,
        DOWNLOAD_URL_BASE='http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/'
):

    assert parallel in [None, 'force', 'avoid'], 'parallel mode not supported'
    if not langs:
        langs = JRC_LANGS
    else:
        if isinstance(langs, str): langs = [langs]
        for l in langs:
            if l not in JRC_LANGS:
                raise ValueError(
                    'Language %s is not among the valid languages in JRC-Acquis v3'
                    % l)

    if not data_path:
        data_path = get_data_home()

    if not os.path.exists(data_path):
        os.mkdir(data_path)

    request = []
    total_read = 0
    for l in langs:
        file_name = 'jrc-' + l + '.tgz'
        archive_path = join(data_path, file_name)

        if not os.path.exists(archive_path):
            print(
                "downloading language-specific dataset (once and for all) into %s"
                % data_path)
            DOWNLOAD_URL = join(DOWNLOAD_URL_BASE, file_name)
            download_file(DOWNLOAD_URL, archive_path)
            print("untarring dataset...")
            with tarfile.open(archive_path, 'r:gz') as archive:
                archive.extractall(data_path)

        documents_dir = join(data_path, l)

        print("Reading documents...")
        read = 0
        for dir in list_dirs(documents_dir):
            year = int(dir)
            if years is None or year in years:
                year_dir = join(documents_dir, dir)
                pickle_name = join(data_path,
                                   'jrc_' + l + '_' + dir + '.pickle')
                if os.path.exists(pickle_name):
                    print("loading from file %s" % pickle_name)
                    l_y_documents = pickle.load(open(pickle_name, "rb"))
                    read += len(l_y_documents)
                else:
                    l_y_documents = []
                    all_documents = list_files(year_dir)
                    empty = 0
                    for i, doc_file in enumerate(all_documents):
                        try:
                            jrc_doc = parse_document(join(year_dir, doc_file),
                                                     year)
                        except ValueError:
                            jrc_doc = None

                        if jrc_doc and (not ignore_unclassified
                                        or jrc_doc.categories):
                            l_y_documents.append(jrc_doc)
                        else:
                            empty += 1
                        # update the progress line ~50 times per folder (integer division gives an exact step)
                        if len(all_documents) > 50 and (
                            (i + 1) % (len(all_documents) // 50) == 0):
                            print('\r\tfrom %s: completed %d%%' %
                                  (year_dir,
                                   int((i + 1) * 100.0 / len(all_documents))),
                                  end='')
                        read += 1
                    print(
                        '\r\tfrom %s: completed 100%% read %d documents (discarded %d without categories or empty fields)\n'
                        % (year_dir, i + 1, empty),
                        end='')
                    print("\t\t(Pickling object for future runs in %s)" %
                          pickle_name)
                    pickle.dump(l_y_documents, open(pickle_name, 'wb'),
                                pickle.HIGHEST_PROTOCOL)
                request += l_y_documents
        print("Read %d documents for language %s\n" % (read, l))
        total_read += read
    print("Read %d documents in total" % (total_read))

    if parallel == 'force':
        request = _force_parallel(request, langs)
    elif parallel == 'avoid':
        request = random_sampling_avoiding_parallel(request)

    final_cats = _get_categories(request)

    if cat_filter:
        request = _filter_by_category(request, cat_filter)
        final_cats = _get_categories(request)
    if cat_threshold > 0:
        request, final_cats = _filter_by_frequency(request, cat_threshold)
    if most_frequent != -1 and len(final_cats) > most_frequent:
        request, final_cats = _most_common(request, most_frequent)

    return request, final_cats
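
A usage sketch for the JRC-Acquis loader; the module name, data path, years, and keyword values are assumptions (the language codes must belong to JRC_LANGS):

# Hypothetical usage; module name and argument values are placeholders.
from jrc_reader import fetch_jrcacquis

docs, cats = fetch_jrcacquis(
    langs=['en', 'it'],
    data_path='/path/to/jrc_acquis',
    years=[2004, 2005],
    cat_threshold=50,    # drop infrequent categories (threshold on per-category frequency)
    parallel='avoid'     # sample so as to avoid parallel (translated) versions of the same document
)
print(len(docs), 'documents,', len(cats), 'categories')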