Example no. 1
def pattern_parse(in_file, language, out_dir):
    if language == 'en':
        from pattern.en import parsetree
    elif language == 'es':
        from pattern.es import parsetree
    elif language == 'de':
        from pattern.de import parsetree
    elif language == 'fr':
        from pattern.fr import parsetree
    elif language == 'it':
        from pattern.it import parsetree
    elif language == 'nl':
        from pattern.nl import parsetree

    tokens = parse(in_file.read(), parsetree)

    pattern_version = pattern.__version__
    header = {
        'format': 'SAF',
        'format-version': '0.1',
        'processed': [{
            'module': "pattern.{}".format(language),
            'module-version': pattern_version,
            'started': datetime.date.today().strftime('%Y-%m-%d')
        }]
    }

    out_file = out_file_name(out_dir, in_file.name, 'json')
    create_dirs(out_file, is_file=True)
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        json.dump({'header': header, 'tokens': tokens}, f, indent=4)
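The command above writes a SAF document as JSON with a 'header' and a 'tokens' key. Below is a minimal sketch of the file it produces; the token fields are assumptions, since they depend on the parse helper rather than on this snippet.

# Hypothetical shape of the SAF JSON written by pattern_parse.
# Only 'header' and 'tokens' are guaranteed by the code above; the token
# fields ('word', 'lemma', 'sentence') are assumed, not confirmed.
saf_sketch = {
    'header': {
        'format': 'SAF',
        'format-version': '0.1',
        'processed': [{
            'module': 'pattern.en',
            'module-version': '2.6',
            'started': '2024-01-01'
        }]
    },
    'tokens': [
        {'word': 'Hello', 'lemma': 'hello', 'sentence': 1},
        {'word': 'world', 'lemma': 'world', 'sentence': 1}
    ]
}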
Example no. 2
def create_word_mappings(saf, alignments, lowercase, out_dir):
    create_dirs(out_dir)

    alignment_data = json.load(alignments)
    aligned1 = alignment_data['gs']
    aligned2 = alignment_data['ocr']

    saf = json.load(saf)
    if lowercase:
        words = [w['word'].lower() for w in saf['tokens']]

        aligned1 = [c.lower() for c in aligned1]
        aligned2 = [c.lower() for c in aligned2]
    else:
        words = [w['word'] for w in saf['tokens']]

    wb = find_word_boundaries(words, aligned1)

    doc_id = remove_ext(alignments.name)

    res = {'gs': [], 'ocr': [], 'doc_id': []}
    for s, e in wb:
        w1 = u''.join(aligned1[s:e])
        w2 = u''.join(aligned2[s:e])

        res['gs'].append(w1.strip())
        res['ocr'].append(w2.strip())
        res['doc_id'].append(doc_id)

    # Use pandas DataFrame to create the csv, so commas and quotes are properly
    # escaped.
    df = pd.DataFrame(res)

    out_file = out_file_name(out_dir, doc_id, ext='csv')
    df.to_csv(out_file, encoding='utf-8')
Example no. 3
def command(in_dir, out_dir, out_name):
    """Create a division of the data in train, test and validation sets.

    The result is stored to a JSON file, so it can be reused.
    """
    # TODO: make seed and percentages options
    SEED = 4
    TEST_PERCENTAGE = 10
    VAL_PERCENTAGE = 10

    create_dirs(out_dir)

    in_files = get_files(in_dir)

    np.random.seed(SEED)
    np.random.shuffle(in_files)

    n_test = int(len(in_files) / 100.0 * TEST_PERCENTAGE)
    n_val = int(len(in_files) / 100.0 * VAL_PERCENTAGE)

    validation_texts = in_files[0:n_val]
    test_texts = in_files[n_val:n_val + n_test]
    train_texts = in_files[n_val + n_test:]

    division = {
        'train': [os.path.basename(t) for t in train_texts],
        'val': [os.path.basename(t) for t in validation_texts],
        'test': [os.path.basename(t) for t in test_texts]
    }

    out_file = os.path.join(out_dir, out_name)
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        json.dump(division, f, indent=4)
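Because the division is stored as plain JSON, it can be reloaded later to select the files for each set. A short usage sketch, assuming the division was written to out/division.json:

import codecs
import json

# Reload the stored data division (the path is an assumed example).
with codecs.open('out/division.json', encoding='utf-8') as f:
    division = json.load(f)

train_files = division['train']  # list of base names, e.g. ['doc42.txt', ...]
val_files = division['val']
test_files = division['test']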
Example no. 4
def saf_to_text(in_dir, out_dir, mode):
    create_dirs(out_dir)

    if mode not in ('word', 'lemma'):
        raise ValueError(
            "Unknown mode: {mode}, "
            "please choose either word or lemma".format(**locals()))

    in_files = get_files(in_dir)

    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            saf = json.load(f)

        s_id = None
        lines = []

        for t in saf['tokens']:
            if s_id is None:
                s_id = t['sentence']
                sentence = []
            elif t['sentence'] != s_id:
                lines.append(u' '.join(sentence))
                sentence = []
                s_id = t['sentence']

            sentence.append(t[mode])

        # flush the final sentence (the loop only flushes on a sentence change)
        if s_id is not None:
            lines.append(u' '.join(sentence))

        out_file = out_file_name(out_dir, os.path.basename(fi), ext='txt')
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            f.write(u'\n'.join(lines))
            f.write(u'\n')
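saf_to_text only relies on the 'sentence' field and on the 'word' or 'lemma' field of each token. A minimal, invented SAF fragment that it would accept:

# Invented SAF fragment; field values are illustrative only.
saf_example = {
    'tokens': [
        {'word': 'Hello', 'lemma': 'hello', 'sentence': 1},
        {'word': 'world', 'lemma': 'world', 'sentence': 1},
        {'word': 'Bye', 'lemma': 'bye', 'sentence': 2},
    ]
}
# With mode='word' this yields two output lines: "Hello world" and "Bye".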
Example no. 5
def rmgarbage(in_file, out_dir):
    create_dirs(out_dir)

    text = in_file.read()
    words = text.split()

    doc_id = os.path.basename(in_file.name).split('.')[0]

    result = []
    removed = []

    for word in words:
        errors = get_rmgarbage_errors(word)

        if len(errors) == 0:
            result.append(word)
        else:
            removed.append({
                'word': word,
                'errors': u''.join(errors),
                'doc_id': doc_id
            })

    out_file = out_file_name(out_dir, in_file.name)
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        f.write(u' '.join(result))

    metadata_out = pd.DataFrame(removed)
    fname = '{}-rmgarbage-metadata.csv'.format(doc_id)
    out_file = out_file_name(out_dir, fname)
    metadata_out.to_csv(out_file, encoding='utf-8')
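get_rmgarbage_errors is defined elsewhere; as a rough illustration, a Taghva-style garbage filter typically applies rules like the ones sketched below. The rule letters and thresholds here are assumptions, not the actual implementation.

import re

def get_rmgarbage_errors_sketch(word):
    """Illustrative subset of rmgarbage-style checks; not the real helper."""
    errors = []
    if len(word) > 40:
        # Rule L: the string is implausibly long for a real word.
        errors.append('L')
    alnum = sum(c.isalnum() for c in word)
    if len(word) > 2 and alnum < len(word) / 2.0:
        # Rule A: fewer than half of the characters are alphanumeric.
        errors.append('A')
    if re.search(r'[a-z][A-Z][a-z]', word):
        # Rule C: suspicious lower-upper-lower case pattern inside the word.
        errors.append('C')
    return errors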
Example no. 6
def saf_to_text(in_dir, out_dir, mode):
    create_dirs(out_dir)

    if mode not in ('word', 'lemma'):
        raise ValueError("Unknown mode: {mode}, "
                         "please choose either word or lemma"
                         .format(**locals()))

    in_files = get_files(in_dir)

    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            saf = json.load(f)

        s_id = None
        lines = []

        for t in saf['tokens']:
            if s_id is None:
                s_id = t['sentence']
                sentence = []
            elif t['sentence'] != s_id:
                lines.append(u' '.join(sentence))
                sentence = []
                s_id = t['sentence']

            sentence.append(t[mode])

        # flush the final sentence (the loop only flushes on a sentence change)
        if s_id is not None:
            lines.append(u' '.join(sentence))

        out_file = out_file_name(out_dir, os.path.basename(fi), ext='txt')
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            f.write(u'\n'.join(lines))
            f.write(u'\n')
Example no. 7
def command(ner_statistics, keep, name, out_dir):
    df = pd.read_csv(ner_statistics, index_col=0, encoding='utf-8')

    df = df.query(u' or '.join([u'ner=="{}"'.format(k) for k in keep]))

    output_file = os.path.join(out_dir, name)
    create_dirs(output_file)
    df.to_csv(output_file, encoding='utf-8')
Example no. 8
def check_utf8(in_dir, convert, processes, out_dir):
    create_dirs(out_dir)
    in_files = get_files(in_dir)

    check = partial(check_file, convert=convert, out_dir=out_dir)

    pool = Pool(processes=processes)
    pool.map(check, in_files)
Example no. 9
def lowercase(in_file, out_dir):
    create_dirs(out_dir)

    text = in_file.read()
    text = text.lower()

    stdout_text = click.get_text_stream('stdout')
    stdout_text.write(text)
Example no. 10
def command(ner_statistics, keep, name, out_dir):
    df = pd.read_csv(ner_statistics, index_col=0, encoding='utf-8')

    df = df.query(u' or '.join([u'ner=="{}"'.format(k) for k in keep]))

    output_file = os.path.join(out_dir, name)
    create_dirs(output_file)
    df.to_csv(output_file, encoding='utf-8')
Example no. 11
def prettify_xml(in_file, out_dir):
    create_dirs(out_dir)

    bs = BeautifulSoup(in_file.read(), 'xml')

    out_file = out_file_name(out_dir, in_file.name, 'xml')
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        f.write(bs.prettify())
Example no. 12
def command(in_dir, datadivision, name, out_dir):
    create_dirs(out_dir)

    div = json.load(datadivision)
    files_out = [cwl_file(f) for f in get_files(in_dir, div, name)]

    stdout_text = click.get_text_stream('stdout')
    stdout_text.write(json.dumps({'out_files': files_out}))
Example no. 13
def ocrevaluation_extract(in_file, out_dir):
    create_dirs(out_dir)

    tables = []

    write = False

    (fd, tmpfile) = tempfile.mkstemp()
    with codecs.open(tmpfile, 'w', encoding='utf-8') as tmp:
        for line in in_file:
            if line.startswith('<h2>General'):
                write = True
            if line.startswith('<h2>Difference'):
                write = False
            if line.startswith('<h2>Error'):
                write = True

            if write:
                tmp.write(line)

    with codecs.open(tmpfile, encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'lxml')

    tables = soup.find_all('table')
    assert len(tables) == 2
    os.remove(tmpfile)

    doc = remove_ext(in_file.name)

    t = tables[0]
    table_data = [[cell.text for cell in row('td')] for row in t('tr')]

    # 'transpose' table_data
    lines = {}
    for data in table_data:
        for i, entry in enumerate(data):
            if i not in lines.keys():
                # add doc id to data line (but not to header)
                if i != 0:
                    lines[i] = [doc]
                else:
                    lines[i] = ['']
            lines[i].append(entry)

    out_file = os.path.join(out_dir, '{}-global.csv'.format(doc))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        for i in range(len(lines.keys())):
            f.write(u','.join(lines[i]))
            f.write(u'\n')

    t = tables[1]
    table_data = [[cell.text for cell in row('td')] for row in t('tr')]
    out_file = os.path.join(out_dir, '{}-character.csv'.format(doc))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        for data in table_data:
            f.write(u'"{}",'.format(data[0]))
            f.write(u','.join(data[1:]))
            f.write(u'\n')
Example no. 14
def normalize_whitespace_punctuation(txt, out_dir):
    create_dirs(out_dir)

    text = txt.read()
    text = normalize_whitespace(text)
    text = normalize_punctuation(text)

    out_file = out_file_name(out_dir, os.path.basename(txt.name))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        f.write(text)
Example no. 15
def command(xml_file, element, out_dir):
    create_dirs(out_dir)

    bs = BeautifulSoup(xml_file.read(), 'xml')

    for elem in element:
        to_empty = bs.find_all(elem)
        for t in to_empty:
            t.decompose()

    out_file = out_file_name(out_dir, os.path.basename(xml_file.name))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        f.write(bs.prettify())
Example no. 16
def remove_newlines(in_file, replacement, out_dir):
    create_dirs(out_dir)

    text = in_file.read()

    if replacement == u'space':
        text = re.sub('\n+', u' ', text)
    else:
        text = text.replace(u'\n', u'')
    text = text.strip()

    stdout_text = click.get_text_stream('stdout')
    stdout_text.write(text)
Example no. 17
def command(in_file, rename, out_dir):
    create_dirs(out_dir)

    ext = os.path.splitext(in_file)[1].replace('.', '')
    fname = os.path.basename(in_file)

    if rename == 'spaces':
        fname = fname.replace(' ', '-')
    elif rename == 'random':
        fname = '{}.{}'.format(uuid.uuid4(), ext)

    fo = out_file_name(out_dir, fname)
    shutil.copy2(in_file, fo)
Example no. 18
def command(xml_file, element, out_dir):
    create_dirs(out_dir)

    bs = BeautifulSoup(xml_file.read(), 'xml')

    for elem in element:
        to_empty = bs.find_all(elem)
        for t in to_empty:
            t.clear()

    out_file = out_file_name(out_dir, os.path.basename(xml_file.name))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        f.write(bs.prettify())
Example no. 19
def remove_newlines(in_file, replacement, out_dir):
    create_dirs(out_dir)

    text = in_file.read()

    if replacement == u'space':
        text = re.sub('\n+', u' ', text)
    else:
        text = text.replace(u'\n', u'')
    text = text.strip()

    stdout_text = click.get_text_stream('stdout')
    stdout_text.write(text)
Example no. 20
def command(in_dir, out_dir, tika_server):
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    for fi in in_files:
        if tika_server:
            parsed = parser.from_file(fi, tika_server)
        else:
            parsed = parser.from_file(fi)

        out_file = out_file_name(out_dir, fi, 'txt')
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            f.write(parsed['content'])
Example no. 21
def freqs(in_dir, out_dir, name):
    out_file = os.path.join(out_dir, name)
    create_dirs(out_file)

    in_files = get_files(in_dir)

    vectorizer = CountVectorizer(input='filename', tokenizer=split)
    X = vectorizer.fit_transform(in_files)
    freqs = np.array(X.sum(axis=0)).squeeze()
    vocab_df = pd.DataFrame(
        {'word': vectorizer.get_feature_names(), 'freq': freqs})
    vocab_df['rank'] = vocab_df['freq'].rank(method='first', ascending=False)
    vocab_df = vocab_df.sort_values('rank')
    vocab_df.to_csv(out_file, encoding='utf-8', index=False)
Example no. 22
def freqs(in_dir, out_dir, name):
    out_file = os.path.join(out_dir, name)
    create_dirs(out_file)

    in_files = get_files(in_dir)

    vectorizer = CountVectorizer(input='filename', tokenizer=split)
    X = vectorizer.fit_transform(in_files)
    freqs = np.array(X.sum(axis=0)).squeeze()
    vocab_df = pd.DataFrame({
        'word': vectorizer.get_feature_names(),
        'freq': freqs
    })
    vocab_df['rank'] = vocab_df['freq'].rank(method='first', ascending=False)
    vocab_df = vocab_df.sort_values('rank')
    vocab_df.to_csv(out_file, encoding='utf-8', index=False)
Example no. 23
def command(ocr_text, gs_text, metadata, out_dir):
    create_dirs(out_dir)

    ocr = ocr_text.read()
    gs = gs_text.read()
    md = json.load(metadata)

    check = True
    # Too many strange characters, so disable sanity check
    if len(set(ocr + gs)) > 127:
        check = False

    ocr_a, gs_a = align_characters(ocr, gs, md['cigar'], sanity_check=check)

    out_file = out_file_name(out_dir, md['doc_id'], 'json')
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        json.dump({'ocr': ocr_a, 'gs': gs_a}, f, encoding='utf-8', indent=4)
Example no. 24
def frog2saf(in_dir, out_dir):
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            lines = f.readlines()
            lines = [line.strip() for line in lines]
        saf_data = frog_to_saf(parse_frog(lines))

        head, tail = os.path.split(fi)
        fname = tail.replace(os.path.splitext(tail)[1], '')

        out_file = os.path.join(out_dir, out_file_name(out_dir, fname, 'json'))
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            json.dump(saf_data, f, indent=4)
Example no. 25
def frog2saf(in_dir, out_dir):
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            lines = f.readlines()
            lines = [line.strip() for line in lines]
        saf_data = frog_to_saf(parse_frog(lines))

        head, tail = os.path.split(fi)
        fname = tail.replace(os.path.splitext(tail)[1], '')

        out_file = os.path.join(out_dir, out_file_name(out_dir, fname, 'json'))
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            json.dump(saf_data, f, indent=4)
Example no. 26
def command(meta_in, meta_out):
    create_dirs(meta_out)

    cgn_tags = pd.read_csv(meta_in, index_col=0, encoding='utf-8')

    result = {}

    for tag in cgn_tags.index:
        click.echo(tag)
        for cgn, uni in tag_mapping.iteritems():
            if tag.startswith(cgn):
                result[tag] = {'pos': uni}

    click.echo(result)

    with codecs.open(meta_out, 'wb', encoding='utf-8') as f:
        json.dump(result, f, indent=4, encoding='utf-8')
Example no. 27
def delete_empty_files(in_dir, out_dir):
    create_dirs(out_dir)

    in_files = get_files(in_dir)
    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            text = f.read()

        if len(text.strip()) > 0:
            fname = out_file_name(out_dir, fi)
            try:
                shutil.copy2(fi, fname)
            except shutil.Error:
                pass
        else:
            print('deleting {}'.format(os.path.basename(fi)))
            if os.path.abspath(in_dir) == os.path.abspath(out_dir):
                os.remove(fi)
Example no. 28
def copy_cwl_files(from_dir=CWL_PATH, to_dir=None):
    """Copy cwl files to a directory where the cwl-runner can find them.

    Args:
        from_dir (str): Path to directory where to copy files from (default:
            the cwl directory of nlppln).
        to_dir (str): Path to directory where the files should be copied to
            (e.g., the CWL working directory).
    """
    cwl_files = glob.glob('{}{}*.cwl'.format(from_dir, os.sep))
    # if no files are found, the output directory should not be created
    if len(cwl_files) > 0:
        create_dirs(to_dir)
    for fi in cwl_files:
        fo = os.path.join(to_dir, os.path.basename(fi))
        shutil.copy2(fi, fo)

    return len(cwl_files)
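A hedged usage sketch for the helper above; 'cwl_steps' is an assumed target directory, and CWL_PATH is the package default shown in the signature.

# Copy the packaged .cwl step definitions into a local working directory
# and report how many were found.
n_copied = copy_cwl_files(to_dir='cwl_steps')
print('copied {} .cwl files'.format(n_copied))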
Example no. 29
def copy_cwl_files(from_dir=CWL_PATH, to_dir=None):
    """Copy cwl files to a directory where the cwl-runner can find them.

    Args:
        from_dir (str): Path to directory where to copy files from (default:
            the cwl directory of nlppln).
        to_dir (str): Path to directory where the files should be copied to
            (e.g., the CWL working directory).
    """
    cwl_files = glob.glob('{}{}*.cwl'.format(from_dir, os.sep))
    # if no files are found, the output directory should not be created
    if len(cwl_files) > 0:
        create_dirs(to_dir)
    for fi in cwl_files:
        fo = os.path.join(to_dir, os.path.basename(fi))
        shutil.copy2(fi, fo)

    return len(cwl_files)
Example no. 30
def create_chunked_list(in_dir, size, out_dir, out_name):
    """Create a division of the input files in chunks.

    The result is stored to a JSON file.
    """
    create_dirs(out_dir)

    in_files = get_files(in_dir)
    chunks = chunk(in_files, size)

    division = {}

    for i, files in enumerate(chunks):
        division[i] = [os.path.basename(f) for f in files]

    out_file = os.path.join(out_dir, out_name)
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        json.dump(division, f, indent=4)
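One detail worth noting: json.dump serializes the integer chunk indices as strings, so the stored file maps "0", "1", ... to lists of file names. A short sketch of reading the division back, assuming it was written as out/chunks.json:

import codecs
import json

# Reload the chunk division; the integer chunk indices were stored as
# string keys, so look them up as '0', '1', ...
with codecs.open('out/chunks.json', encoding='utf-8') as f:  # assumed path
    chunks = json.load(f)

first_chunk = chunks['0']  # list of base names in the first chunk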
Example no. 31
def clin2018st_extract_text(json_file, out_dir):
    create_dirs(out_dir)

    corrections = {}
    gs_text = []
    text_with_errors = []

    text = json.load(json_file)
    for w in text['corrections']:
        span = w['span']
        # TODO: fix 'after'
        if 'after' in w.keys():
            print('Found "after" in {}.'.format(
                os.path.basename(json_file.name)))
        for i, w_id in enumerate(span):
            corrections[w_id] = {}
            if i == 0:
                corrections[w_id]['text'] = w['text']
            else:
                corrections[w_id]['text'] = u''
            corrections[w_id]['last'] = False
            if i == (len(span) - 1):
                corrections[w_id]['last'] = True

    for w in text['words']:
        w_id = w['id']
        gs_text.append(w['text'])
        if w_id in corrections.keys():
            text_with_errors.append(corrections[w_id]['text'])
        else:
            text_with_errors.append(w['text'])
        if w['space']:
            gs_text.append(u' ')
            text_with_errors.append(u' ')

    gs_file = remove_ext(json_file.name)
    gs_file = os.path.join(out_dir, '{}-gs.txt'.format(gs_file))
    with codecs.open(gs_file, 'wb', encoding='utf-8') as f:
        f.write(u''.join(gs_text))

    err_file = remove_ext(json_file.name)
    err_file = os.path.join(out_dir, '{}-errors.txt'.format(err_file))
    with codecs.open(err_file, 'wb', encoding='utf-8') as f:
        f.write(u''.join(text_with_errors))
Example no. 32
def ocrevaluation_extract(in_file, out_dir):
    create_dirs(out_dir)

    soup = BeautifulSoup(in_file, 'lxml')
    tables = []
    for header in soup.find_all('h2'):
        if (header.text == 'General results'
                or header.text.startswith('Error rate')):
            tables.append(header.find_next('table'))

    assert len(tables) == 2

    doc = remove_ext(in_file.name)

    t = tables[0]
    table_data = [[cell.text for cell in row('td')] for row in t('tr')]

    # 'transpose' table_data
    lines = {}
    for data in table_data:
        for i, entry in enumerate(data):
            if i not in lines.keys():
                # add doc id to data line (but not to header)
                if i != 0:
                    lines[i] = [doc]
                else:
                    lines[i] = ['doc_id']
            lines[i].append(entry.replace(',', '.'))

    out_file = os.path.join(out_dir, '{}-global.csv'.format(doc))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        for i in range(len(lines.keys())):
            f.write(u';'.join(lines[i]))
            f.write(u'\n')

    t = tables[1]
    table_data = [[cell.text.replace(',', '.') for cell in row('td')]
                  for row in t('tr')]
    out_file = os.path.join(out_dir, '{}-character.csv'.format(doc))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        for data in table_data:
            f.write(u'"{}";'.format(data[0]))
            f.write(u';'.join(data[1:]))
            f.write(u'\n')
Example no. 33
def merge2openiti(in_file1, in_file2, out_dir):
    create_dirs(out_dir)

    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')

    lines1 = in_file1.readlines()
    lines2 = in_file2.readlines()

    merged = []
    for l1, l2 in zip(lines1[:10], lines2[:10]):
        merged_sentence = merge_sentences(l1, l2)
        merged.append(merged_sentence)

    out_file = out_file_name(out_dir, in_file1.name)
    print(out_file)
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        f.write(''.join(merged))
Example no. 34
def nerstats(in_dir, out_dir, name):
    create_dirs(out_dir)

    frames = []

    in_files = get_files(in_dir)

    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            saf = json.load(f)
        data = {}
        data['word'] = [t['word'] for t in saf['tokens'] if 'ne' in t.keys()]
        data['ner'] = [t['ne'] for t in saf['tokens'] if 'ne' in t.keys()]
        data['w_id'] = [t['id'] for t in saf['tokens'] if 'ne' in t.keys()]
        data['text'] = [os.path.basename(fi)
                        for t in saf['tokens'] if 'ne' in t.keys()]

        frames.append(pd.DataFrame(data=data))

    df = pd.concat(frames, ignore_index=True)
    df.to_csv(os.path.join(out_dir, name), encoding='utf-8')
Example no. 35
def xml_to_text(in_dir, out_dir, tag):
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            root = etree.ElementTree().parse(f)
        if tag is not None:
            elements = list(root.iter('{*}' + tag))
        else:
            elements = [root]
        texts = []
        for el in elements:
            texts.append(' '.join(
                [e.text for e in el.iterdescendants() if e.text is not None]))

        out_file = out_file_name(out_dir, fi, 'txt')
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            f.write('\n'.join(texts))
            f.write('\n')
Example no. 36
def command(in_file, out_dir):
    create_dirs(out_dir)

    lines = in_file.readlines()
    # OCR_toInput: lines[0][:14]
    # OCR_aligned: lines[1][:14]
    # GS_aligned: lines[2][:14]
    ocr = to_character_list(lines[1][14:].strip())
    gs = to_character_list(lines[2][14:].strip())

    # Write texts
    out_file = out_file_name(os.path.join(out_dir, 'ocr'),
                             os.path.basename(in_file.name))
    print(out_file)
    create_dirs(out_file)
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        f.write(u''.join(ocr))

    out_file = out_file_name(os.path.join(out_dir, 'gs'),
                             os.path.basename(in_file.name))
    print(out_file)
    create_dirs(out_file)
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        f.write(u''.join(gs))

    out_file = out_file_name(out_dir, os.path.basename(in_file.name), 'json')
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        json.dump({'ocr': ocr, 'gs': gs}, f, encoding='utf-8', indent=4)
Example no. 37
def ocrevaluation_extract(in_file, out_dir):
    create_dirs(out_dir)

    soup = BeautifulSoup(in_file, 'lxml')
    tables = soup.find_all('table')
    assert len(tables) == 3

    doc = remove_ext(in_file.name)

    # global measures: table[0]
    t = tables[0]
    table_data = [[cell.text for cell in row('td')] for row in t('tr')]

    # 'transpose' table_data
    lines = {}
    for data in table_data:
        for i, entry in enumerate(data):
            if i not in lines.keys():
                # add doc id to data line (but not to header)
                if i != 0:
                    lines[i] = [doc]
                else:
                    lines[i] = ['']
            lines[i].append(entry)

    out_file = os.path.join(out_dir, '{}-global.csv'.format(doc))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        for i in range(len(lines.keys())):
            f.write(u','.join(lines[i]))
            f.write(u'\n')

    # character measures: table[2]
    t = tables[2]
    table_data = [[cell.text for cell in row('td')] for row in t('tr')]
    out_file = os.path.join(out_dir, '{}-character.csv'.format(doc))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        for data in table_data:
            f.write(u'"{}",'.format(data[0]))
            f.write(u','.join(data[1:]))
            f.write(u'\n')
Example no. 38
def freqs(in_dir, out_dir, name, mode):
    if mode not in ('word', 'lemma'):
        raise ValueError("Unknown mode: {mode}, "
                         "please choose either word or lemma"
                         .format(**locals()))
    output_file = out_file_name(out_dir, name)
    create_dirs(output_file)

    in_files = get_files(in_dir)

    cnt = Counter()
    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            saf = json.load(f)
        for token in saf['tokens']:
            word = token[mode]
            pos = token['pos1']
            cnt.update({(word, pos): 1})
    data = [(word, pos, count) for ((word, pos), count) in cnt.most_common()]
    vocab_df = pd.DataFrame(data, columns=[mode, 'pos', 'cnt'])
    vocab_df['rank'] = vocab_df.index + 1
    vocab_df.to_csv(output_file, encoding='utf-8', index=False)
Example no. 39
def match_ocr_and_gs(ocr_dir, gs_dir, out_dir):
    create_dirs(out_dir)

    ocr_files = {os.path.basename(f): f for f in get_files(ocr_dir)}
    gs_files = {os.path.basename(f): f for f in get_files(gs_dir)}

    ocr = set(ocr_files.keys())
    gs = set(gs_files.keys())

    if len(ocr) == 0:
        raise ValueError('No ocr files in directory "{}".'.format(ocr_dir))
    if len(gs) == 0:
        raise ValueError('No gs files in directory "{}".'.format(gs_dir))

    keep = ocr.intersection(gs)

    if len(keep) == 0:
        raise ValueError('No matching ocr and gs files.')

    for name in keep:
        copy_file(ocr_files[name], name, out_dir, 'ocr')
        copy_file(gs_files[name], name, out_dir, 'gs')
Example no. 40
def xml_to_text(in_dir, out_dir, tag):
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            root = etree.ElementTree().parse(f)
        if tag is not None:
            elements = list(root.iter('{*}' + tag))
        else:
            elements = [root]
        texts = []
        for el in elements:
            texts.append(' '.join(
                [e.text for e in el.iterdescendants() if
                    e.text is not None]))

        out_file = out_file_name(out_dir, fi, 'txt')
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            f.write('\n'.join(texts))
            f.write('\n')
Example no. 41
def basic_text_statistics(in_dir, out_dir, name):
    create_dirs(out_dir)

    d = {'num_words': [], 'num_sentences': []}

    text_names = []

    in_files = get_files(in_dir)

    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            text = json.load(f, encoding='utf-8')

        text_id = os.path.splitext(os.path.basename(fi))[0]
        text_names.append(text_id)
        d['num_words'].append(len(text['tokens']))
        sentences = [t['sentence'] for t in text['tokens']]
        num_sentences = len(set(sentences))
        d['num_sentences'].append(num_sentences)

    df = pd.DataFrame(d, index=text_names)
    meta_out = out_file_name(out_dir, name)
    df.to_csv(meta_out, encoding='utf-8')
Example no. 42
def archive2dir(archive, remove_dir_structure, out_dir):
    if remove_dir_structure:
        result_dir = os.path.join(out_dir, str(uuid.uuid4()))
        create_dirs(result_dir)

        # make temporary directory
        tempdir = tempfile.mkdtemp()

        # extract archive to temporary directory
        patoolib.extract_archive(archive, outdir=tempdir)

        # copy extracted files to output dir
        files = get_files(tempdir, recursive=True)
        for f in files:
            fo = out_file_name(result_dir, f)
            # don't copy if it's the same file
            if os.path.abspath(f) != fo:
                shutil.copy2(f, fo)

        # remove temporary directory and its contents
        shutil.rmtree(tempdir)
    else:
        # extract archive to temporary directory
        patoolib.extract_archive(archive, outdir=out_dir)
Example no. 43
def test_create_dirs_with_file_name(fs):
    # Uses pyfakefs http://pyfakefs.org
    create_dirs('/test/test/test.txt', is_file=True)
    assert os.path.exists('/test/test/')
    assert not os.path.exists('/test/test/test.txt')
Example no. 44
def test_create_dirs_with_dir_name(fs):
    # Uses pyfakefs http://pyfakefs.org
    create_dirs('/test/test/')
    assert os.path.exists('/test/test/')