def saf_to_text(in_dir, out_dir, mode):
    create_dirs(out_dir)

    if mode not in ('word', 'lemma'):
        raise ValueError(
            "Unknown mode: {mode}, "
            "please choose either word or lemma".format(**locals()))

    in_files = get_files(in_dir)
    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            saf = json.load(f)

        s_id = None
        lines = []
        sentence = []
        for t in saf['tokens']:
            if s_id is None:
                s_id = t['sentence']
            elif t['sentence'] != s_id:
                lines.append(u' '.join(sentence))
                sentence = []
                s_id = t['sentence']
            sentence.append(t[mode])
        # don't drop the final sentence of the document
        if sentence:
            lines.append(u' '.join(sentence))

        out_file = out_file_name(out_dir, os.path.basename(fi), ext='txt')
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            f.write(u'\n'.join(lines))
            f.write(u'\n')
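# A minimal sketch of the SAF input that saf_to_text expects, inferred from the
# keys accessed above ('tokens', 'sentence', 'word', 'lemma'); real SAF
# documents may carry more fields per token. With mode='word', the tokens below
# are written as two lines of plain text.
example_saf = {
    'tokens': [
        {'sentence': 1, 'word': 'Hello', 'lemma': 'hello'},
        {'sentence': 1, 'word': 'world', 'lemma': 'world'},
        {'sentence': 2, 'word': 'Bye', 'lemma': 'bye'},
    ]
}
# Resulting text file contents:
# Hello world
# Bye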
def command(in_dir, out_dir, out_name):
    """Create a division of the data in train, test and validation sets.

    The result is stored to a JSON file, so it can be reused.
    """
    # TODO: make seed and percentages options
    SEED = 4
    TEST_PERCENTAGE = 10
    VAL_PERCENTAGE = 10

    create_dirs(out_dir)

    in_files = get_files(in_dir)

    np.random.seed(SEED)
    np.random.shuffle(in_files)

    n_test = int(len(in_files) / 100.0 * TEST_PERCENTAGE)
    n_val = int(len(in_files) / 100.0 * VAL_PERCENTAGE)

    validation_texts = in_files[0:n_val]
    test_texts = in_files[n_val:n_val + n_test]
    train_texts = in_files[n_val + n_test:]

    division = {
        'train': [os.path.basename(t) for t in train_texts],
        'val': [os.path.basename(t) for t in validation_texts],
        'test': [os.path.basename(t) for t in test_texts]
    }

    out_file = os.path.join(out_dir, out_name)
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        json.dump(division, f, indent=4)
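# Hypothetical illustration of how the division file written by command() could
# be consumed; 'division.json' and '/path/to/data' are placeholder names. The
# JSON mirrors the `division` dict above: file basenames grouped under 'train',
# 'val', and 'test'.
import codecs
import json
import os

with codecs.open('division.json', encoding='utf-8') as f:
    division = json.load(f)

train_files = [os.path.join('/path/to/data', n) for n in division['train']]
val_files = [os.path.join('/path/to/data', n) for n in division['val']]
test_files = [os.path.join('/path/to/data', n) for n in division['test']]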
def check_utf8(in_dir, convert, processes, out_dir):
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    check = partial(check_file, convert=convert, out_dir=out_dir)

    pool = Pool(processes=processes)
    pool.map(check, in_files)
def test_get_files(fs):
    # Uses pyfakefs http://pyfakefs.org
    fs.create_file('/data/a.txt')
    fs.create_file('/data/c.txt')
    fs.create_file('/data/b.txt')

    result = get_files('/data')

    assert result == ['/data/a.txt', '/data/b.txt', '/data/c.txt']
def safar_add_metadata(in_dir, in_dir_meta, in_file_meta, out_dir):
    in_files = get_files(in_dir)
    metadata_files = {os.path.basename(f): f for f in get_files(in_dir_meta)}

    doc_id = os.path.splitext(os.path.basename(in_file_meta))[0]
    out_dir_sub = os.path.join(out_dir, doc_id)
    if not os.path.exists(out_dir_sub):
        os.mkdir(out_dir_sub)

    with open(in_file_meta) as fn:
        metadata_all = BeautifulSoup(fn, 'xml')

    for in_file in in_files:
        metadata_file = metadata_files[os.path.basename(in_file)]
        with open(metadata_file) as f:
            metadata = BeautifulSoup(f, 'xml')

        with codecs.open(in_file, encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'xml')

        # Make document with a single root element
        document = BeautifulSoup('<document></document>', 'xml')

        md_all = copy.copy(metadata_all.metadata)
        md = copy.copy(metadata.metadata)

        # add common metadata
        for m in md_all.find_all('meta'):
            md.append(m)
        document.document.append(md)

        # append the analysis; fall back to the stemmer output if the input
        # contains no morphology analysis
        try:
            document.document.append(soup.morphology_analysis)
        except (AttributeError, ValueError):
            document.document.append(soup.stemmer_analysis)

        xml_out = out_file_name(out_dir_sub, in_file)
        with codecs.open(xml_out, 'wb', encoding='utf-8') as f:
            if six.PY2:
                # six.u doesn't work in Python 2 with non-ascii text
                # See https://pythonhosted.org/six/#six.u
                f.write(unicode(document))
            else:
                f.write(str(document))
def match_ocr_and_gs(ocr_dir, gs_dir, out_dir):
    create_dirs(out_dir)

    ocr_files = {os.path.basename(f): f for f in get_files(ocr_dir)}
    gs_files = {os.path.basename(f): f for f in get_files(gs_dir)}

    ocr = set(ocr_files.keys())
    gs = set(gs_files.keys())

    if len(ocr) == 0:
        raise ValueError('No ocr files in directory "{}".'.format(ocr_dir))
    if len(gs) == 0:
        raise ValueError('No gs files in directory "{}".'.format(gs_dir))

    keep = ocr.intersection(gs)
    if len(keep) == 0:
        raise ValueError('No matching ocr and gs files.')

    for name in keep:
        copy_file(ocr_files[name], name, out_dir, 'ocr')
        copy_file(gs_files[name], name, out_dir, 'gs')
def command(in_dir, out_dir, tika_server):
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    for fi in in_files:
        if tika_server:
            parsed = parser.from_file(fi, tika_server)
        else:
            parsed = parser.from_file(fi)

        out_file = out_file_name(out_dir, fi, 'txt')
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            f.write(parsed['content'])
def freqs(in_dir, out_dir, name):
    out_file = os.path.join(out_dir, name)
    create_dirs(out_file)

    in_files = get_files(in_dir)

    vectorizer = CountVectorizer(input='filename', tokenizer=split)
    X = vectorizer.fit_transform(in_files)

    freqs = np.array(X.sum(axis=0)).squeeze()
    vocab_df = pd.DataFrame(
        {'word': vectorizer.get_feature_names(), 'freq': freqs})
    vocab_df['rank'] = vocab_df['freq'].rank(method='first', ascending=False)
    # DataFrame.sort() was removed in pandas 0.20; use sort_values() instead
    vocab_df = vocab_df.sort_values('rank')

    vocab_df.to_csv(out_file, encoding='utf-8', index=False)
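# The `split` tokenizer passed to CountVectorizer above is not defined in this
# snippet; presumably it is a simple whitespace tokenizer along these lines
# (an assumption about the helper, not its actual implementation):
def split(text):
    """Split a text on whitespace."""
    return text.split()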
def frog2saf(in_dir, out_dir):
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            lines = f.readlines()
        lines = [line.strip() for line in lines]
        saf_data = frog_to_saf(parse_frog(lines))

        head, tail = os.path.split(fi)
        fname = tail.replace(os.path.splitext(tail)[1], '')

        # out_file_name already joins the output directory to the file name
        out_file = out_file_name(out_dir, fname, 'json')

        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            json.dump(saf_data, f, indent=4)
def delete_empty_files(in_dir, out_dir):
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            text = f.read()

        if len(text.strip()) > 0:
            fname = out_file_name(out_dir, fi)
            try:
                shutil.copy2(fi, fname)
            except shutil.Error:
                pass
        else:
            print('deleting {}'.format(os.path.basename(fi)))
            if os.path.abspath(in_dir) == os.path.abspath(out_dir):
                os.remove(fi)
def create_chunked_list(in_dir, size, out_dir, out_name):
    """Create a division of the input files in chunks.

    The result is stored to a JSON file.
    """
    create_dirs(out_dir)

    in_files = get_files(in_dir)
    chunks = chunk(in_files, size)

    division = {}
    for i, files in enumerate(chunks):
        division[i] = [os.path.basename(f) for f in files]

    out_file = os.path.join(out_dir, out_name)
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        json.dump(division, f, indent=4)
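# Sketch of the JSON written by create_chunked_list for size=2 and four input
# files, assuming chunk() yields consecutive groups of `size` files (file names
# are illustrative); json.dump turns the integer chunk indices into string keys:
#
# {
#     "0": ["a.txt", "b.txt"],
#     "1": ["c.txt", "d.txt"]
# }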
def concat_files(in_dir, out_dir):
    in_files = get_files(in_dir)

    counts = Counter()

    for in_file in in_files:
        parts = os.path.basename(in_file).split(u'_')
        prefix = u'_'.join(parts[:2])
        counts[prefix] += 1

        out_file = out_file_name(out_dir, prefix, ext='txt')

        with codecs.open(in_file, 'r', encoding='utf-8') as fi:
            text = fi.read()
            text = text.replace(u'\n', u'')
            text = text.strip()

        with codecs.open(out_file, 'a', encoding='utf-8') as fo:
            fo.write(text)
            fo.write(u'\n')
def xml_to_text(in_dir, out_dir, tag):
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            root = etree.ElementTree().parse(f)

        if tag is not None:
            elements = list(root.iter('{*}' + tag))
        else:
            elements = [root]

        texts = []
        for el in elements:
            texts.append(' '.join(
                [e.text for e in el.iterdescendants() if e.text is not None]))

        out_file = out_file_name(out_dir, fi, 'txt')
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            f.write('\n'.join(texts))
            f.write('\n')
def nerstats(in_dir, out_dir, name):
    create_dirs(out_dir)

    frames = []

    in_files = get_files(in_dir)
    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            saf = json.load(f)

        data = {}
        data['word'] = [t['word'] for t in saf['tokens'] if 'ne' in t.keys()]
        data['ner'] = [t['ne'] for t in saf['tokens'] if 'ne' in t.keys()]
        data['w_id'] = [t['id'] for t in saf['tokens'] if 'ne' in t.keys()]
        data['text'] = [os.path.basename(fi)
                        for t in saf['tokens'] if 'ne' in t.keys()]

        frames.append(pd.DataFrame(data=data))

    df = pd.concat(frames, ignore_index=True)
    df.to_csv(os.path.join(out_dir, name), encoding='utf-8')
def freqs(in_dir, out_dir, name, mode):
    if mode not in ('word', 'lemma'):
        raise ValueError(
            "Unknown mode: {mode}, "
            "please choose either word or lemma".format(**locals()))

    output_file = out_file_name(out_dir, name)
    create_dirs(output_file)

    in_files = get_files(in_dir)

    cnt = Counter()
    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            saf = json.load(f)
        for token in saf['tokens']:
            word = token[mode]
            pos = token['pos1']
            cnt.update({(word, pos): 1})

    data = [(word, pos, count) for ((word, pos), count) in cnt.most_common()]
    vocab_df = pd.DataFrame(data, columns=[mode, 'pos', 'cnt'])
    vocab_df['rank'] = vocab_df.index + 1

    vocab_df.to_csv(output_file, encoding='utf-8', index=False)
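# The resulting CSV has one row per (word-or-lemma, POS) pair, sorted by
# frequency; a plausible example with mode='lemma' (all values illustrative):
#
# lemma,pos,cnt,rank
# de,LID,1250,1
# zijn,WW,830,2
# boek,N,412,3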
def merge_csv(in_dir, out_dir, name):
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    wrote_header = False

    out_file = out_file_name(out_dir, name)
    with codecs.open(out_file, 'wb', encoding='utf-8') as fo:
        for fi in in_files:
            with codecs.open(fi, encoding='utf-8') as f:
                lines = f.readlines()
            if len(lines) > 1:
                header = lines[0]
                data = lines[1:]

                # TODO: check if headers are the same
                if not wrote_header:
                    fo.write(header)
                    wrote_header = True

                for line in data:
                    fo.write(line)
def basic_text_statistics(in_dir, meta_out):
    create_dirs(meta_out)

    d = {'num_words': [], 'num_sentences': []}
    text_names = []

    in_files = get_files(in_dir)
    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            # the file is already decoded as utf-8, so json.load needs no
            # encoding argument (it was removed in Python 3.9)
            text = json.load(f)

        text_id = os.path.splitext(os.path.basename(fi))[0]
        text_names.append(text_id)

        d['num_words'].append(len(text['tokens']))

        sentences = [t['sentence'] for t in text['tokens']]
        num_sentences = len(set(sentences))
        d['num_sentences'].append(num_sentences)

    df = pd.DataFrame(d, index=text_names)
    df.to_csv(meta_out, encoding='utf-8')
def basic_text_statistics(in_dir, out_dir, name):
    create_dirs(out_dir)

    d = {'num_words': [], 'num_sentences': []}
    text_names = []

    in_files = get_files(in_dir)
    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            text = json.load(f)

        text_id = os.path.splitext(os.path.basename(fi))[0]
        text_names.append(text_id)

        d['num_words'].append(len(text['tokens']))

        sentences = [t['sentence'] for t in text['tokens']]
        num_sentences = len(set(sentences))
        d['num_sentences'].append(num_sentences)

    df = pd.DataFrame(d, index=text_names)

    meta_out = out_file_name(out_dir, name)
    df.to_csv(meta_out, encoding='utf-8')
def sac2gs_and_ocr(in_dir, out_dir):
    result = {}
    result['gs_de'] = []
    result['ocr_de'] = []
    result['gs_fr'] = []
    result['ocr_fr'] = []

    files = {}

    for i in range(1864, 1900):
        try:
            in_files = get_files(os.path.join(in_dir, str(i)))
            for fi in in_files:
                language = 'de'
                typ = 'gs'

                bn = os.path.basename(fi)
                if bn.endswith('ocr'):
                    typ = 'ocr'
                if 'fr' in bn:
                    language = 'fr'

                with codecs.open(fi, encoding='utf-8') as f:
                    text = f.read()

                fname = '{}-{}-{}.txt'.format(i, language, typ)
                out_file = os.path.join(out_dir, fname)
                create_dirs(out_file)

                with codecs.open(out_file, 'a', encoding='utf-8') as fo:
                    fo.write(text)

                if out_file not in files:
                    label = '{}_{}'.format(typ, language)
                    result[label].append(cwl_file(out_file))
                    files[out_file] = None
        except OSError:
            pass

    stdout_text = click.get_text_stream('stdout')
    stdout_text.write(json.dumps(result))
def archive2dir(archive, remove_dir_structure, out_dir):
    if remove_dir_structure:
        result_dir = os.path.join(out_dir, str(uuid.uuid4()))
        create_dirs(result_dir)

        # make temporary directory
        tempdir = tempfile.mkdtemp()

        # extract archive to temporary directory
        patoolib.extract_archive(archive, outdir=tempdir)

        # copy extracted files to output dir
        files = get_files(tempdir, recursive=True)
        for f in files:
            fo = out_file_name(result_dir, f)
            # don't copy if it's the same file
            if os.path.abspath(f) != fo:
                shutil.copy2(f, fo)

        # remove temporary directory and its contents
        shutil.rmtree(tempdir)
    else:
        # extract archive directly to the output directory
        patoolib.extract_archive(archive, outdir=out_dir)
def merge_safar_xml(in_dir, out_dir):
    """Command line tool that merges SAFAR xml files into a single file.
    """
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    analysis_tag = 'morphology_analysis'

    metadata = b'<metadata></metadata>'
    markers = {}
    marker_words = {}

    if len(in_files) == 0:
        msg = 'Unable to merge xml files, because the input directory is ' \
              'empty.'
        raise ValueError(msg)
    else:
        num_words = 0

        fname = os.path.basename(in_files[0]).split('-')[0]
        xml_out = out_file_name(out_dir, u'{}.xml'.format(fname))

        click.echo('Reading xml files')
        (fd, tmpfile) = tempfile.mkstemp()
        os.close(fd)  # only the file name is needed; close the open handle
        with codecs.open(tmpfile, 'wb') as words:
            for i, fi in tqdm.tqdm(enumerate(in_files)):
                # Check whether we are dealing with a marker
                m = is_marked(os.path.basename(fi))
                if m:
                    mname = os.path.basename(fi).rsplit('-', 1)[0]

                if i == 0:
                    # check whether the analysis_tag should be
                    # stemmer_analysis and extract the metadata
                    context = etree.iterparse(
                        fi, events=('end', ),
                        tag=('stemmer_analysis', 'metadata'))
                    for event, elem in context:
                        if elem.tag == 'stemmer_analysis':
                            analysis_tag = elem.tag
                        elif elem.tag == 'metadata':
                            metadata = etree.tostring(elem, encoding='utf-8')

                # Check whether we are dealing with a marker
                if m:
                    if mname not in markers.keys():
                        markers[mname] = []
                        marker_words[mname] = []

                # Extract the words
                context = etree.iterparse(fi, events=('end', ), tag='word')
                for event, elem in context:
                    num_words += 1
                    elem.attrib['w_id'] = str(num_words)
                    if m:
                        markers[mname].append(str(num_words))
                        marker_words[mname].append(elem.attrib['value'])
                    # Setting method to html (instead of xml) fixes problems
                    # with writing Arabic characters in the value attribute of
                    # the word element.
                    words.write(
                        etree.tostring(elem, encoding='utf-8', method='html'))

                    # make iteration over context fast and consume less memory
                    # https://www.ibm.com/developerworks/xml/library/x-hiperfparse
                    elem.clear()
                    while elem.getprevious() is not None:
                        del elem.getparent()[0]
                del context

        # write the output
        click.echo('Writing output')
        with codecs.open(xml_out, 'wb') as f:
            f.write(b'<?xml version="1.0" encoding="utf-8"?>\n')
            f.write(b'<document>\n')
            f.write(metadata)
            tag = ' <{} total_words="{}">\n'.format(analysis_tag, num_words)
            f.write(tag.encode('utf-8'))

            with codecs.open(tmpfile, 'rb') as words_file:
                for line in tqdm.tqdm(words_file):
                    f.write(line)

            f.write(' </{}>\n'.format(analysis_tag).encode('utf-8'))

            f.write(b'<markers>\n')
            for fname, w_ids in markers.items():
                if 'header' in fname:
                    level = fname.rsplit('-', 1)[1]
                    f.write(marker_xml('header', marker_words[fname], w_ids,
                                       'level', level))
                else:
                    if 'QQuote' in fname:
                        typ = 'quran'
                    else:
                        typ = 'hadith'
                    f.write(marker_xml('quote', marker_words[fname], w_ids,
                                       'type', typ))
            f.write(b'</markers>\n')
            f.write(b'</document>\n')

        os.remove(tmpfile)
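# Rough shape of the merged file written above; element names are taken from
# the writes in merge_safar_xml, attribute values are illustrative, and the
# exact layout of the <markers> children depends on marker_xml (not shown):
#
# <?xml version="1.0" encoding="utf-8"?>
# <document>
# <metadata></metadata>
#  <morphology_analysis total_words="2">
# <word w_id="1" value="...">...</word>
# <word w_id="2" value="...">...</word>
#  </morphology_analysis>
# <markers>
# ... header markers (with a level) and quote markers (quran or hadith) ...
# </markers>
# </document>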