Example #1
def saf_to_text(in_dir, out_dir, mode):
    create_dirs(out_dir)

    if mode not in ('word', 'lemma'):
        raise ValueError(
            "Unknown mode: {mode}, "
            "please choose either word or lemma".format(**locals()))

    in_files = get_files(in_dir)

    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            saf = json.load(f)

        s_id = None
        lines = []

        for t in saf['tokens']:
            if s_id is None:
                s_id = t['sentence']
                sentence = []
            elif t['sentence'] != s_id:
                lines.append(u' '.join(sentence))
                sentence = []
                s_id = t['sentence']

            sentence.append(t[mode])

        # append the last sentence, which the loop above never emits
        if s_id is not None:
            lines.append(u' '.join(sentence))

        out_file = out_file_name(out_dir, os.path.basename(fi), ext='txt')
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            f.write(u'\n'.join(lines))
            f.write(u'\n')
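
All of the examples on this page call a handful of helpers (get_files, create_dirs, out_file_name) that come from the surrounding package and are not shown. The sketch below is only an assumption of what they do, inferred from how the examples call them; it is not the package's actual implementation.

import glob
import os


def get_files(in_dir, recursive=False):
    # Assumed behaviour: return a sorted list of file paths below in_dir.
    pattern = os.path.join(in_dir, '**', '*') if recursive \
        else os.path.join(in_dir, '*')
    return sorted(p for p in glob.glob(pattern, recursive=recursive)
                  if os.path.isfile(p))


def create_dirs(path):
    # Assumed behaviour: make sure the directory for path exists; if path
    # looks like a file name (it has an extension), create its parent instead.
    directory = os.path.dirname(path) if os.path.splitext(path)[1] else path
    if directory and not os.path.exists(directory):
        os.makedirs(directory)


def out_file_name(out_dir, fname, ext=None):
    # Assumed behaviour: build an output path in out_dir, optionally
    # swapping the extension of the input file name.
    base = os.path.basename(fname)
    if ext is not None:
        base = '{}.{}'.format(os.path.splitext(base)[0], ext)
    return os.path.join(out_dir, base)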
Example #2
def command(in_dir, out_dir, out_name):
    """Create a division of the data in train, test and validation sets.

    The result is stored to a JSON file, so it can be reused.
    """
    # TODO: make seed and percentages options
    SEED = 4
    TEST_PERCENTAGE = 10
    VAL_PERCENTAGE = 10

    create_dirs(out_dir)

    in_files = get_files(in_dir)

    np.random.seed(SEED)
    np.random.shuffle(in_files)

    n_test = int(len(in_files) / 100.0 * TEST_PERCENTAGE)
    n_val = int(len(in_files) / 100.0 * VAL_PERCENTAGE)

    validation_texts = in_files[0:n_val]
    test_texts = in_files[n_val:n_val + n_test]
    train_texts = in_files[n_val + n_test:]

    division = {
        'train': [os.path.basename(t) for t in train_texts],
        'val': [os.path.basename(t) for t in validation_texts],
        'test': [os.path.basename(t) for t in test_texts]
    }

    out_file = os.path.join(out_dir, out_name)
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        json.dump(division, f, indent=4)
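
The division written by this command can later be loaded to recover the train/validation/test file sets. A small, hypothetical usage example (the paths are illustrative, not taken from the package):

import codecs
import json
import os

# Illustrative paths; in practice they depend on how `command` was called.
with codecs.open('out/division.json', encoding='utf-8') as f:
    division = json.load(f)

# The JSON stores basenames, so map them back onto the input directory.
train_files = [os.path.join('data', name) for name in division['train']]
print('{} train, {} val, {} test texts'.format(
    len(division['train']), len(division['val']), len(division['test'])))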
Example #3
def saf_to_text(in_dir, out_dir, mode):
    create_dirs(out_dir)

    if mode not in ('word', 'lemma'):
        raise ValueError("Unknown mode: {mode}, "
                         "please choose either word or lemma"
                         .format(**locals()))

    in_files = get_files(in_dir)

    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            saf = json.load(f)

        s_id = None
        lines = []

        for t in saf['tokens']:
            if s_id is None:
                s_id = t['sentence']
                sentence = []
            elif t['sentence'] != s_id:
                lines.append(u' '.join(sentence))
                sentence = []
                s_id = t['sentence']

            sentence.append(t[mode])

        # append the last sentence, which the loop above never emits
        if s_id is not None:
            lines.append(u' '.join(sentence))

        out_file = out_file_name(out_dir, os.path.basename(fi), ext='txt')
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            f.write(u'\n'.join(lines))
            f.write(u'\n')
Example #4
def check_utf8(in_dir, convert, processes, out_dir):
    create_dirs(out_dir)
    in_files = get_files(in_dir)

    check = partial(check_file, convert=convert, out_dir=out_dir)

    pool = Pool(processes=processes)
    pool.map(check, in_files)
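
check_file is defined elsewhere in the package; the worker below is only a guess at what a per-file UTF-8 check with optional conversion could look like (names and behaviour are assumptions, with chardet used for encoding detection):

import codecs
import os

import chardet


def check_file(fi, convert=False, out_dir=None):
    # Hypothetical worker: report files that do not decode as UTF-8 and,
    # when convert is set, write a re-encoded copy to out_dir.
    with open(fi, 'rb') as f:
        raw = f.read()
    try:
        raw.decode('utf-8')
        return True
    except UnicodeDecodeError:
        print('{} is not valid utf-8'.format(os.path.basename(fi)))
        if convert and out_dir is not None:
            encoding = chardet.detect(raw)['encoding'] or 'latin-1'
            out_file = os.path.join(out_dir, os.path.basename(fi))
            with codecs.open(out_file, 'wb', encoding='utf-8') as fo:
                fo.write(raw.decode(encoding, errors='replace'))
        return False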
Example #5
def test_get_files(fs):
    # Uses pyfakefs http://pyfakefs.org
    fs.create_file('/data/a.txt')
    fs.create_file('/data/c.txt')
    fs.create_file('/data/b.txt')

    result = get_files('/data')

    assert result == ['/data/a.txt', '/data/b.txt', '/data/c.txt']
Example #6
def safar_add_metadata(in_dir, in_dir_meta, in_file_meta, out_dir):
    in_files = get_files(in_dir)
    metadata_files = {os.path.basename(f): f for f in get_files(in_dir_meta)}

    doc_id = os.path.splitext(os.path.basename(in_file_meta))[0]

    out_dir_sub = os.path.join(out_dir, doc_id)
    if not os.path.exists(out_dir_sub):
        os.mkdir(out_dir_sub)

    with open(in_file_meta) as fn:
        metadata_all = BeautifulSoup(fn, 'xml')

    for in_file in in_files:
        metadata_file = metadata_files[os.path.basename(in_file)]
        with open(metadata_file) as f:
            metadata = BeautifulSoup(f, 'xml')
        with codecs.open(in_file, encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'xml')

        # Make document with a single root element
        document = BeautifulSoup('<document></document>', 'xml')
        md_all = copy.copy(metadata_all.metadata)
        md = copy.copy(metadata.metadata)
        # add common meta data
        for m in md_all.find_all('meta'):
            md.append(m)
        document.document.append(md)
        try:
            document.document.append(soup.morphology_analysis)
        except:
            document.document.append(soup.stemmer_analysis)
        xml_out = out_file_name(out_dir_sub, in_file)
        with codecs.open(xml_out, 'wb', encoding='utf-8') as f:
            if six.PY2:
                # six.u doesn't work in Python 2 with non-ascii text
                # See https://pythonhosted.org/six/#six.u
                f.write(unicode(document))
            else:
                f.write(str(document))
Example #7
def match_ocr_and_gs(ocr_dir, gs_dir, out_dir):
    create_dirs(out_dir)

    ocr_files = {os.path.basename(f): f for f in get_files(ocr_dir)}
    gs_files = {os.path.basename(f): f for f in get_files(gs_dir)}

    ocr = set(ocr_files.keys())
    gs = set(gs_files.keys())

    if len(ocr) == 0:
        raise ValueError('No ocr files in directory "{}".'.format(ocr_dir))
    if len(gs) == 0:
        raise ValueError('No gs files in directory "{}".'.format(gs_dir))

    keep = ocr.intersection(gs)

    if len(keep) == 0:
        raise ValueError('No matching ocr and gs files.')

    for name in keep:
        copy_file(ocr_files[name], name, out_dir, 'ocr')
        copy_file(gs_files[name], name, out_dir, 'gs')
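
copy_file is another helper that is not shown here. Judging from the call copy_file(ocr_files[name], name, out_dir, 'ocr'), it presumably copies a matched file into an 'ocr' or 'gs' subdirectory of out_dir; a sketch under that assumption:

import os
import shutil


def copy_file(src, name, out_dir, subdir):
    # Assumed behaviour: copy src to out_dir/subdir/name, creating the
    # subdirectory on first use.
    dest_dir = os.path.join(out_dir, subdir)
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
    shutil.copy2(src, os.path.join(dest_dir, name))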
Example #8
def command(in_dir, out_dir, tika_server):
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    for fi in in_files:
        if tika_server:
            parsed = parser.from_file(fi, tika_server)
        else:
            parsed = parser.from_file(fi)

        out_file = out_file_name(out_dir, fi, 'txt')
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            f.write(parsed['content'])
Example #9
def freqs(in_dir, out_dir, name):
    out_file = os.path.join(out_dir, name)
    create_dirs(out_file)

    in_files = get_files(in_dir)

    vectorizer = CountVectorizer(input='filename', tokenizer=split)
    X = vectorizer.fit_transform(in_files)
    freqs = np.array(X.sum(axis=0)).squeeze()
    vocab_df = pd.DataFrame(
        {'word': vectorizer.get_feature_names(), 'freq': freqs})
    vocab_df['rank'] = vocab_df['freq'].rank(method='first', ascending=False)
    vocab_df = vocab_df.sort_values('rank')
    vocab_df.to_csv(out_file, encoding='utf-8', index=False)
Example #10
def freqs(in_dir, out_dir, name):
    out_file = os.path.join(out_dir, name)
    create_dirs(out_file)

    in_files = get_files(in_dir)

    vectorizer = CountVectorizer(input='filename', tokenizer=split)
    X = vectorizer.fit_transform(in_files)
    freqs = np.array(X.sum(axis=0)).squeeze()
    vocab_df = pd.DataFrame({
        'word': vectorizer.get_feature_names(),
        'freq': freqs
    })
    vocab_df['rank'] = vocab_df['freq'].rank(method='first', ascending=False)
    vocab_df = vocab_df.sort_values('rank')
    vocab_df.to_csv(out_file, encoding='utf-8', index=False)
Example #11
def frog2saf(in_dir, out_dir):
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            lines = f.readlines()
            lines = [line.strip() for line in lines]
        saf_data = frog_to_saf(parse_frog(lines))

        head, tail = os.path.split(fi)
        fname = tail.replace(os.path.splitext(tail)[1], '')

        out_file = os.path.join(out_dir, out_file_name(out_dir, fname, 'json'))
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            json.dump(saf_data, f, indent=4)
Example #12
def frog2saf(in_dir, out_dir):
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            lines = f.readlines()
            lines = [line.strip() for line in lines]
        saf_data = frog_to_saf(parse_frog(lines))

        head, tail = os.path.split(fi)
        fname = tail.replace(os.path.splitext(tail)[1], '')

        out_file = os.path.join(out_dir, out_file_name(out_dir, fname, 'json'))
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            json.dump(saf_data, f, indent=4)
Example #13
def delete_empty_files(in_dir, out_dir):
    create_dirs(out_dir)

    in_files = get_files(in_dir)
    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            text = f.read()

        if len(text.strip()) > 0:
            fname = out_file_name(out_dir, fi)
            try:
                shutil.copy2(fi, fname)
            except shutil.Error:
                pass
        else:
            print('deleting {}'.format(os.path.basename(fi)))
            if os.path.abspath(in_dir) == os.path.abspath(out_dir):
                os.remove(fi)
Example #14
def create_chunked_list(in_dir, size, out_dir, out_name):
    """Create a division of the input files in chunks.

    The result is stored to a JSON file.
    """
    create_dirs(out_dir)

    in_files = get_files(in_dir)
    chunks = chunk(in_files, size)

    division = {}

    for i, files in enumerate(chunks):
        division[i] = [os.path.basename(f) for f in files]

    out_file = os.path.join(out_dir, out_name)
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        json.dump(division, f, indent=4)
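
chunk is assumed to split the ordered list of input files into consecutive groups of at most size items; a minimal sketch of such a helper:

def chunk(items, size):
    # Assumed behaviour: yield consecutive slices of at most `size` items.
    for start in range(0, len(items), size):
        yield items[start:start + size]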
Example #15
def concat_files(in_dir, out_dir):
    in_files = get_files(in_dir)

    counts = Counter()

    for in_file in in_files:
        parts = os.path.basename(in_file).split(u'_')
        prefix = u'_'.join(parts[:2])
        counts[prefix] += 1

        out_file = out_file_name(out_dir, prefix, ext='txt')

        with codecs.open(in_file, 'r', encoding='utf-8') as fi:
            text = fi.read()
            text = text.replace(u'\n', u'')
            text = text.strip()

        with codecs.open(out_file, 'a', encoding='utf-8') as fo:
            fo.write(text)
            fo.write(u'\n')
Example #16
def xml_to_text(in_dir, out_dir, tag):
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            root = etree.ElementTree().parse(f)
        if tag is not None:
            elements = list(root.iter('{*}' + tag))
        else:
            elements = [root]
        texts = []
        for el in elements:
            texts.append(' '.join(
                [e.text for e in el.iterdescendants() if e.text is not None]))

        out_file = out_file_name(out_dir, fi, 'txt')
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            f.write('\n'.join(texts))
            f.write('\n')
Example #17
def nerstats(in_dir, out_dir, name):
    create_dirs(out_dir)

    frames = []

    in_files = get_files(in_dir)

    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            saf = json.load(f)
        data = {}
        data['word'] = [t['word'] for t in saf['tokens'] if 'ne' in t.keys()]
        data['ner'] = [t['ne'] for t in saf['tokens'] if 'ne' in t.keys()]
        data['w_id'] = [t['id'] for t in saf['tokens'] if 'ne' in t.keys()]
        data['text'] = [os.path.basename(fi)
                        for t in saf['tokens'] if 'ne' in t.keys()]

        frames.append(pd.DataFrame(data=data))

    df = pd.concat(frames, ignore_index=True)
    df.to_csv(os.path.join(out_dir, name), encoding='utf-8')
Example #18
def freqs(in_dir, out_dir, name, mode):
    if mode not in ('word', 'lemma'):
        raise ValueError(
            "Unknown mode: {mode}, "
            "please choose either word or lemma".format(**locals()))
    output_file = out_file_name(out_dir, name)
    create_dirs(output_file)

    in_files = get_files(in_dir)

    cnt = Counter()
    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            saf = json.load(f)
        for token in saf['tokens']:
            word = token[mode]
            pos = token['pos1']
            cnt.update({(word, pos): 1})
    data = [(word, pos, count) for ((word, pos), count) in cnt.most_common()]
    vocab_df = pd.DataFrame(data, columns=[mode, 'pos', 'cnt'])
    vocab_df['rank'] = vocab_df.index + 1
    vocab_df.to_csv(output_file, encoding='utf-8', index=False)
Example #19
def freqs(in_dir, out_dir, name, mode):
    if mode not in ('word', 'lemma'):
        raise ValueError("Unknown mode: {mode}, "
                         "please choose either word or lemma"
                         .format(**locals()))
    output_file = out_file_name(out_dir, name)
    create_dirs(output_file)

    in_files = get_files(in_dir)

    cnt = Counter()
    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            saf = json.load(f)
        for token in saf['tokens']:
            word = token[mode]
            pos = token['pos1']
            cnt.update({(word, pos): 1})
    data = [(word, pos, count) for ((word, pos), count) in cnt.most_common()]
    vocab_df = pd.DataFrame(data, columns=[mode, 'pos', 'cnt'])
    vocab_df['rank'] = vocab_df.index + 1
    vocab_df.to_csv(output_file, encoding='utf-8', index=False)
Example #20
def nerstats(in_dir, out_dir, name):
    create_dirs(out_dir)

    frames = []

    in_files = get_files(in_dir)

    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            saf = json.load(f)
        data = {}
        data['word'] = [t['word'] for t in saf['tokens'] if 'ne' in t.keys()]
        data['ner'] = [t['ne'] for t in saf['tokens'] if 'ne' in t.keys()]
        data['w_id'] = [t['id'] for t in saf['tokens'] if 'ne' in t.keys()]
        data['text'] = [
            os.path.basename(fi) for t in saf['tokens'] if 'ne' in t.keys()
        ]

        frames.append(pd.DataFrame(data=data))

    df = pd.concat(frames, ignore_index=True)
    df.to_csv(os.path.join(out_dir, name), encoding='utf-8')
Example #21
def merge_csv(in_dir, out_dir, name):
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    wrote_header = False

    out_file = out_file_name(out_dir, name)
    with codecs.open(out_file, 'wb', encoding='utf-8') as fo:
        for fi in in_files:
            with codecs.open(fi, encoding='utf-8') as f:
                lines = f.readlines()
            if len(lines) > 1:
                header = lines[0]
                data = lines[1:]

                # TODO: check if headers are the same
                if not wrote_header:
                    fo.write(header)
                    wrote_header = True
                for line in data:
                    fo.write(line)
Example #22
def basic_text_statistics(in_dir, meta_out):
    create_dirs(meta_out)

    d = {'num_words': [], 'num_sentences': []}

    text_names = []

    in_files = get_files(in_dir)

    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            text = json.load(f)

        text_id = os.path.splitext(os.path.basename(fi))[0]
        text_names.append(text_id)
        d['num_words'].append(len(text['tokens']))
        sentences = [t['sentence'] for t in text['tokens']]
        num_sentences = len(set(sentences))
        d['num_sentences'].append(num_sentences)

    df = pd.DataFrame(d, index=text_names)
    df.to_csv(meta_out, encoding='utf-8')
Example #23
def xml_to_text(in_dir, out_dir, tag):
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            root = etree.ElementTree().parse(f)
        if tag is not None:
            elements = list(root.iter('{*}' + tag))
        else:
            elements = [root]
        texts = []
        for el in elements:
            texts.append(' '.join(
                [e.text for e in el.iterdescendants() if e.text is not None]))

        out_file = out_file_name(out_dir, fi, 'txt')
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            f.write('\n'.join(texts))
            f.write('\n')
Example #24
def basic_text_statistics(in_dir, out_dir, name):
    create_dirs(out_dir)

    d = {'num_words': [], 'num_sentences': []}

    text_names = []

    in_files = get_files(in_dir)

    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            text = json.load(f)

        text_id = os.path.splitext(os.path.basename(fi))[0]
        text_names.append(text_id)
        d['num_words'].append(len(text['tokens']))
        sentences = [t['sentence'] for t in text['tokens']]
        num_sentences = len(set(sentences))
        d['num_sentences'].append(num_sentences)

    df = pd.DataFrame(d, index=text_names)
    meta_out = out_file_name(out_dir, name)
    df.to_csv(meta_out, encoding='utf-8')
Example #25
def sac2gs_and_ocr(in_dir, out_dir):
    result = {}
    result['gs_de'] = []
    result['ocr_de'] = []
    result['gs_fr'] = []
    result['ocr_fr'] = []

    files = {}

    for i in range(1864, 1900):
        try:
            in_files = get_files(os.path.join(in_dir, str(i)))
            for fi in in_files:
                language = 'de'
                typ = 'gs'
                bn = os.path.basename(fi)

                if bn.endswith('ocr'):
                    typ = 'ocr'
                if 'fr' in bn:
                    language = 'fr'
                with codecs.open(fi, encoding='utf-8') as f:
                    text = f.read()
                fname = '{}-{}-{}.txt'.format(i, language, typ)
                out_file = os.path.join(out_dir, fname)
                create_dirs(out_file)
                with codecs.open(out_file, 'a', encoding='utf-8') as fo:
                    fo.write(text)
                if out_file not in files:
                    label = '{}_{}'.format(typ, language)
                    result[label].append(cwl_file(out_file))
                    files[out_file] = None
        except OSError:
            pass

    stdout_text = click.get_text_stream('stdout')
    stdout_text.write(json.dumps(result))
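
cwl_file appears to wrap a path in the File-object dictionary that Common Workflow Language tools expect. A minimal sketch under that assumption (not necessarily the package's implementation):

import os


def cwl_file(path):
    # Assumed behaviour: represent a local file as a CWL File object.
    return {'class': 'File', 'path': os.path.abspath(path)}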
Example #26
def archive2dir(archive, remove_dir_structure, out_dir):
    if remove_dir_structure:
        result_dir = os.path.join(out_dir, str(uuid.uuid4()))
        create_dirs(result_dir)

        # make temporary directory
        tempdir = tempfile.mkdtemp()

        # extract archive to temporary directory
        patoolib.extract_archive(archive, outdir=tempdir)

        # copy extracted files to output dir
        files = get_files(tempdir, recursive=True)
        for f in files:
            fo = out_file_name(result_dir, f)
            # don't copy if it's the same file
            if os.path.abspath(f) != fo:
                shutil.copy2(f, fo)

        # remove temporary directory and its contents
        shutil.rmtree(tempdir)
    else:
        # extract archive to temporary directory
        patoolib.extract_archive(archive, outdir=out_dir)
Example #27
def merge_safar_xml(in_dir, out_dir):
    """Command line tool that merges SAFAR xml files into a single file.
    """
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    analysis_tag = 'morphology_analysis'

    words = []
    metadata = b'<metadata></metadata>'
    markers = {}
    marker_words = {}

    if len(in_files) == 0:
        msg = 'Unable to merge xml files, because the input directory is ' \
              'empty.'
        raise ValueError(msg)
    else:
        num_words = 0

        fname = os.path.basename(in_files[0]).split('-')[0]
        xml_out = out_file_name(out_dir, u'{}.xml'.format(fname))

        click.echo('Reading xml files')
        (fd, tmpfile) = tempfile.mkstemp()
        os.close(fd)  # close the raw descriptor; the path is reopened below
        with codecs.open(tmpfile, 'wb') as words:
            for i, fi in tqdm.tqdm(enumerate(in_files)):
                # Check whether we are dealing with a marker
                m = is_marked(os.path.basename(fi))
                if m:
                    mname = os.path.basename(fi).rsplit('-', 1)[0]

                if i == 0:
                    # check whether the analysis_tag should be stemmer_analysis
                    # and extract the metadata
                    context = etree.iterparse(fi,
                                              events=('end', ),
                                              tag=('stemmer_analysis',
                                                   'metadata'))
                    for event, elem in context:
                        if elem.tag == 'stemmer_analysis':
                            analysis_tag = elem.tag
                        elif elem.tag == 'metadata':
                            metadata = etree.tostring(elem, encoding='utf-8')

                # Check whether we are dealing with a marker
                if m:
                    if mname not in markers:
                        markers[mname] = []
                        marker_words[mname] = []
                # Extract the words
                context = etree.iterparse(fi, events=('end', ), tag='word')
                for event, elem in context:
                    num_words += 1
                    elem.attrib['w_id'] = str(num_words)

                    if m:
                        markers[mname].append(str(num_words))
                        marker_words[mname].append(elem.attrib['value'])

                    # Setting method to html (instead of xml) fixes problems
                    # with writing Arabic characters in the value attribute of
                    # the word element.
                    words.write(
                        etree.tostring(elem, encoding='utf-8', method='html'))

                    # make iteration over context fast and consume less memory
                    # https://www.ibm.com/developerworks/xml/library/x-hiperfparse
                    elem.clear()
                    while elem.getprevious() is not None:
                        del elem.getparent()[0]
                del context

        # write the output
        click.echo('Writing output')
        with codecs.open(xml_out, 'wb') as f:
            f.write(b'<?xml version="1.0" encoding="utf-8"?>\n')
            f.write(b'<document>\n')

            f.write(metadata)

            tag = '  <{} total_words="{}">\n'.format(analysis_tag, num_words)
            f.write(tag.encode('utf-8'))

            with codecs.open(tmpfile, 'rb') as words_file:
                for line in tqdm.tqdm(words_file):
                    f.write(line)

            f.write('  </{}>\n'.format(analysis_tag).encode('utf-8'))

            f.write(b'<markers>\n')

            for fname, w_ids in markers.items():
                if 'header' in fname:
                    level = fname.rsplit('-', 1)[1]
                    f.write(
                        marker_xml('header', marker_words[fname], w_ids,
                                   'level', level))
                else:
                    if 'QQuote' in fname:
                        typ = 'quran'
                    else:
                        typ = 'hadith'
                    f.write(
                        marker_xml('quote', marker_words[fname], w_ids, 'type',
                                   typ))

            f.write(b'</markers>\n')

            f.write(b'</document>\n')
        os.remove(tmpfile)
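
Two helpers used above, is_marked and marker_xml, are not shown. Based only on how they are called here, they might look roughly like this; both signatures and the file-name pattern are assumptions, not the actual implementations:

import re


def is_marked(basename):
    # Assumed: marker fragments carry a suffix such as '-header-2' or
    # '-QQuote-1' before the extension.
    return bool(re.search(r'-(header|QQuote|Quote)-\d+', basename))


def marker_xml(tag, words, w_ids, attr_name, attr_value):
    # Assumed: serialize one marker element, referencing word ids, as bytes
    # so it can be written to the binary output stream.
    xml = u'  <{tag} {attr}="{val}" w_ids="{ids}">{text}</{tag}>\n'.format(
        tag=tag, attr=attr_name, val=attr_value,
        ids=u' '.join(w_ids), text=u' '.join(words))
    return xml.encode('utf-8')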