示例#1
0
from nltk import sent_tokenize
from functions import _read, write_lines
# English: split every line into sentences with NLTK's sent_tokenize and
# write the result back under the same file name.
lines = _read("global.en")

new = []
for line in lines:
    new.extend(sent_tokenize(line))

write_lines(new, "global.en")

# Nepali: sentences are split on the purna viram (danda, U+0964) when it is
# followed by whitespace.
import re

# The delimiter is spelled as an escape so it cannot be damaged by a wrong
# file encoding; the previous literal had been decoded into Cyrillic text
# and therefore did not match the danda in Nepali input.
PURNA_VIRAM_SPLIT = re.compile(r"\u0964\s")

lines = _read("global.ne")

new = []
for line in lines:
    # re.split discards the matched delimiter, so the danda and the
    # whitespace after it are not part of the resulting pieces.
    new.extend(PURNA_VIRAM_SPLIT.split(line))

write_lines(new, "global.ne")
示例#2
0
from functions import _read, write_lines, xml_to_text,\
 text_to_docx, check_repetition_pair

# Name handed to xml_to_text (passed without an extension).
filename = "mono"

# Earlier variant, kept for reference:
# write_lines(xml_to_text(filename), filename + ".ne")

# Run the conversion, then store what it returned as "mono.txt".
extracted = xml_to_text(filename)
write_lines(extracted, "mono.txt")

# Disabled follow-up step, kept for reference:
# lines = check_repetition(_read("a.ne"))
示例#3
0
tmx_file = "NP8"

# Collect every <seg> element of the TMX document.
tree = etree.parse(tmx_file + ".tmx")
paragraphs = tree.findall(".//seg")

# Text content of each segment with newlines removed and the outer
# whitespace stripped.
lines = [
    etree.tostring(seg, method="text", encoding="unicode")
    .replace("\n", "")
    .strip()
    for seg in paragraphs
]

# Segments are assigned alternately: the 1st, 3rd, ... go to `en` and the
# 2nd, 4th, ... go to `ne`.
en = lines[0::2]
ne = lines[1::2]

for collection in (lines, en, ne):
    summarize(collection)

write_lines(en, tmx_file + ".en")
write_lines(ne, tmx_file + ".ne")

示例#4
0
from functions import _read, write_lines, xml_to_text, summarize, length_filter
import os

directory = "mono/"

# Run xml_to_text once per directory entry. The extension is removed from
# each entry name before the path is built.
lines = []
for entry in os.listdir(directory):
    stem, _extension = os.path.splitext(entry)
    lines.extend(xml_to_text(directory + stem))

# Apply length_filter with the limit 25 before reporting and saving.
lines = length_filter(lines, 25)

print("Total: ")
summarize(lines)

write_lines(lines, "e-h.ne")
示例#5
0
# Joins a continuation line to the previous line.
# A continuation line starts with lowercase letters/digits that are followed
# by something other than a digit, "i", "." or ")", so list markers such as
# "1.", "a)" or "ii." are left alone.
# The pattern is a raw string: "\." and "\)" are invalid escapes in a normal
# literal. It is compiled once instead of being looked up per iteration.
_CONTINUATION = re.compile(r"^([a-z0-9])+[^0-9i\.\)]")

while i < len(a):
    # `i > 0` guards the first line: without it a[i - 1] is a[-1], so the
    # first line would be appended to the LAST line of the list.
    if i > 0 and _CONTINUATION.match(a[i]):
        a[i - 1] = a[i - 1].strip() + ' ' + a[i].strip()
        # Do not advance: the next line has moved into index i.
        del a[i]
    else:
        i += 1

# Joins a numeral line to the next line.
# A numeral line is shorter than three characters and starts with one or two
# lowercase letters/digits followed by "." or ")" (for example "1." or "a)").
# The pattern is a raw string because "\.", "\)" and "\s" are invalid escape
# sequences in a normal literal; it is compiled once, outside the loop.
_NUMERAL_MARKER = re.compile(r"^([a-z0-9]){1,2}[\.\)]\s*")

i = 0
while i < len(a) - 1:
    if len(a[i]) < 3 and _NUMERAL_MARKER.match(a[i]):
        a[i] = a[i].strip() + ' ' + a[i + 1].strip()
        del a[i + 1]
    # Always advance: the merged line is no longer a bare marker.
    i += 1

write_lines(a, "1_bpf.en")

# For Nepali

# Removes lines that begin with a purna viram (danda, U+0964).
# The character is written as an escape so a wrong file encoding cannot
# corrupt it; the previous literal had been decoded into Cyrillic text.
_LEADING_PURNA_VIRAM = re.compile(r"^\u0964")

# Filtering in one pass checks every line. The former index loop advanced
# after each deletion and so never examined the line that followed a removed
# one. Slice assignment keeps `b` the same list object.
b[:] = [line for line in b if not _LEADING_PURNA_VIRAM.match(line)]

# Joins a numeral line to the next line
i = 0
while i < len(b) - 1:
    if len(b[i]) < 3 and re.match("^([a-z0-9]){1,2}[\.\)]\s*", b[i]):
        b[i] = b[i].strip() + ' ' + b[i + 1].strip()
示例#6
0
# sents = set()

# for (en, ne) in pairs:
# 	try:
# 		if len(ne.split()) > 3 and detect(ne) in ('hi', 'ne'):
# 			sents.add((en, ne))
# 	except Exception:
# 		pass

# Keep the sentence pairs whose length_similarity score exceeds 0.53 and
# remember every score that was computed.
scores = []
sents = []

for (en, ne) in zip(eng, nep):
    score = length_similarity(en, ne)
    # Record the score; the list used to stay empty, which made the count
    # printed below always 0.
    scores.append(score)
    try:
        keep = score > 0.53
    except TypeError:
        # The score cannot be compared with a float (presumably None from
        # length_similarity - verify against its definition); skip the pair.
        continue
    if keep:
        sents.append((en, ne))

# Number of pairs that were scored.
print(len(scores))

# Separate the surviving pairs into the two parallel lists.
g1 = [en for (en, _) in sents]
g2 = [ne for (_, ne) in sents]

write_lines(g1, "gnome_final.en")
write_lines(g2, "gnome_final.ne")
示例#7
0
# write_lines(g2, "PR_improved.ne")

# REMOVING REPETITIONS IN A SET

A, B = _read("globalvoices_improved.en"), _read("globalvoices_improved.ne")

# The written lists come from check_repetition_pair alone.
# A hand-rolled pass over set(zip(A, B)) previously built g1/g2 first, but
# its result was overwritten by this call before it was ever used, its
# `en not in list` membership test was quadratic in the number of pairs,
# and iterating a set gave it no stable order. It has been dropped; the
# files written below are unchanged.
g1, g2 = check_repetition_pair(A, B)

write_lines(g1, "global.en")
write_lines(g2, "global.ne")
示例#8
0
from functions import _read, write_lines, remove_blank_lines

file = "mono"

# Read "<file>.en", pass its lines through remove_blank_lines and write the
# result back under the same name.
path = file + ".en"
lines = _read(path)
cleaned = remove_blank_lines(lines)
write_lines(cleaned, path)