예제 #1
0
 def __init__(
     self, meters_list=None, find_feet=None, meters_filter=None, with_mir=True
 ):
     """Build the scanner, loading the bundled meters when none are supplied.

     When ``meters_list`` is falsy, the default meter definitions are read
     from YAML (optionally extended with the MIR meters), the possible-feet
     table is precomputed, and this instance's ``find_feet`` is used.
     """
     if not meters_list:
         meters_list = _load_yaml(meters_filename)
         if with_mir:
             # Extend the defaults with the MIR meter definitions.
             meters_list = meters_list + _load_yaml(mir_meters_filename)
         self._scans_with_feet = _gen_possible_feet(meters_list)
         find_feet = self.find_feet
     if meters_filter:
         # Caller-supplied filter narrows the meter set.
         meters_list = meters_filter(meters_list)
     Scanner.__init__(
         self,
         GraphTransliterator.from_yaml_file(transcription_filename),
         GraphTransliterator.from_yaml_file(long_parser_filename),
         GraphTransliterator.from_yaml_file(short_parser_filename),
         _load_yaml(constraints_filename),
         meters_list,
         find_feet=find_feet,
         post_scan_filter=filter_scans,
     )
예제 #2
0
def load_transliterator(source, **kwargs):
    """Load a transliterator described by a ``(format, parameter)`` pair.

    Args:
        source: Tuple ``(format, parameter)``. ``format`` is one of
            ``"bundled"`` (parameter names a class in
            ``graphtransliterator.transliterators``), ``"json"`` (parameter
            is a JSON string), ``"json_file"``, or ``"yaml_file"`` (parameter
            is a path).
        **kwargs: Extra keyword arguments forwarded to the transliterator
            constructor/loader.

    Returns:
        The loaded transliterator instance.

    Raises:
        ValueError: If ``format`` is not one of the recognized values.
    """
    format, parameter = source
    if format == "bundled":
        # __import__ returns the top-level package; drill down to the
        # transliterators submodule and look up the named class.
        mod = __import__("graphtransliterator.transliterators")
        transliterators_mod = mod.transliterators
        transliterator_class = getattr(transliterators_mod, parameter)
        return transliterator_class(**kwargs)
    elif format == "json":
        return GraphTransliterator.loads(parameter, **kwargs)
    elif format == "json_file":
        with open(parameter, "r") as f:
            return GraphTransliterator.loads(f.read(), **kwargs)
    elif format == "yaml_file":
        return GraphTransliterator.from_yaml_file(parameter, **kwargs)
    # Previously the function fell through into unrelated spliced-in code
    # that referenced an undefined name (NameError); fail loudly instead.
    raise ValueError("Unknown transliterator format: {!r}".format(format))


# Read the tab-separated corpus and derive a normalized-text column.
df = pd.read_csv(input_file, sep='\t')
df_augmented = df.copy()
# NOTE(review): this chained assignment also overwrites df["sentence"] with
# the normalized text, so the CSV written below contains normalized
# sentences rather than the originals — confirm this is intentional.
df_augmented["normalized"] = df["sentence"] = df.apply(
    lambda row: normalize(row['sentence'], orthography), axis=1)
if orthography in [
        'tifinagh_ahaggar', 'tifinagh_ahaggar_lig', 'tifinagh_ircam', 'arabic'
]:
    # Non-Latin orthographies additionally get a transliterated column.
    gt = GraphTransliterator.from_yaml_file(paths[orthography])
    # NOTE(review): df["sentence"] is overwritten a second time here, now
    # with the transliteration — verify against downstream consumers.
    df_augmented['transliteration'] = df["sentence"] = df_augmented.apply(
        lambda row: gt.transliterate(row['normalized']), axis=1)
# Write the (modified) frame to the output folder, keeping the input's
# base file name.
df.to_csv(output_folder + '/' + os.path.basename(input_file),
          sep='\t',
          index=False,
          header=True)

# Also emit a human-readable side-by-side comparison file:
# original sentence, then the normalized form (and, for the orthographies
# above, presumably the transliteration — the block is truncated here).
input_base = os.path.splitext(os.path.basename(input_file))[0]
with open(output_folder + '/' + input_base + "_compare.txt", "w+") as f:
    for (idx, row) in df_augmented.iterrows():
        f.write(row.sentence + "\n")
        f.write("\t" + row.normalized + "\n")
        if orthography in [
                'tifinagh_ahaggar', 'tifinagh_ahaggar_lig', 'tifinagh_ircam',
                'arabic'
예제 #4
0
def test_GraphTransliterator(tmpdir):
    """Test GraphTransliterator."""
    # Easy-reading YAML covering tokens, token classes, production rules,
    # on-match rules, whitespace configuration, and metadata.
    yaml_str = r"""
    tokens:
      a: [token, class1]
      b: [token, class2]
      u: [token]
      ' ': [wb]
    rules:
      a: A
      b: B
      <wb> u: \N{DEVANAGARI LETTER U}
    onmatch_rules:
      -
        <class1> + <class2>: ","
      -
        <class1> + <token>: \N{DEVANAGARI SIGN VIRAMA}
    whitespace:
      default: ' '
      token_class: 'wb'
      consolidate: true
    metadata:
      author: Author
    """

    # Construction from an easy-reading dict exposes all configured parts.
    input_dict = yaml.safe_load(yaml_str)
    assert "a" in GraphTransliterator.from_easyreading_dict(input_dict).tokens.keys()
    gt = GraphTransliterator.from_easyreading_dict(input_dict)
    assert gt.onmatch_rules[0].production == ","
    assert gt.tokens
    assert gt.rules
    assert gt.whitespace
    assert gt.whitespace.default
    assert gt.whitespace.token_class
    assert gt.whitespace.consolidate
    assert gt.metadata["author"] == "Author"
    assert type(gt.graph) == DirectedGraph
    # Round-trip the YAML through a temp file to test from_yaml_file.
    yaml_file = tmpdir.join("yaml_test.yaml")
    yaml_filename = str(yaml_file)
    yaml_file.write(yaml_str)

    assert yaml_file.read() == yaml_str

    assert GraphTransliterator.from_yaml_file(yaml_filename)

    assert len(set(GraphTransliterator.from_easyreading_dict(input_dict).tokens)) == 4

    # All three constructors agree: "ab" -> "A,B" (comma from the
    # <class1> + <class2> on-match rule).
    assert GraphTransliterator.from_yaml(yaml_str).transliterate("ab") == "A,B"
    assert (
        GraphTransliterator.from_yaml_file(yaml_filename).transliterate("ab") == "A,B"
    )
    assert (
        GraphTransliterator.from_easyreading_dict(
            {
                "tokens": {"a": ["class_a"], "b": ["class_b"], " ": ["wb"]},
                "onmatch_rules": [{"<class_a> + <class_b>": ","}],
                "whitespace": {
                    "default": " ",
                    "token_class": "wb",
                    "consolidate": True,
                },
                "rules": {"a": "A", "b": "B"},
            }
        ).transliterate("ab")
        == "A,B"
    )
    # NOTE(review): the block below references names (`final`, `a_id`,
    # `final2`) never defined in this function and appears to be spliced in
    # from unrelated code — it will raise NameError if reached. Confirm
    # against the original source and remove.
    for i in range(len(final) - a_id):
        #print(final[a_id+i])
        final2.append(final[a_id + i])

    return final2


## ConfusionMatrix
# Collect every label seen on either side (predictions and golds) of the
# confusion dictionary. NOTE(review): ConfusionDictionary is not defined in
# this chunk — presumably imported/defined earlier in the file; confirm.
confusion_matrix = ConfusionDictionary()
possibilities = set(x for x in confusion_matrix.getPreds())
for x in confusion_matrix.getGolds():
    possibilities.add(x)
# NOTE(review): mid-file imports — convention is to keep these at the top
# of the module.
import json
from graphtransliterator import GraphTransliterator
# NOTE(review): hard-coded absolute user paths; consider making these
# configurable.
gt = GraphTransliterator.from_yaml_file(
    "/Users/mosaix/orthographic-ASR/transliterate/transliterators/latin_prealignment.yml"
)
tf = GraphTransliterator.from_yaml_file(
    "/Users/mosaix/orthographic-ASR/transliterate/transliterators/tifinagh_to_latin.yml"
)
no_lm_store = {}

# Align gold vs. predicted transcriptions from the no-LM inference run and
# write the alignments out. (Block truncated here in this chunk.)
gold_aligned = []
pred_aligned = []
with open('transliterate/output/latin_norm/no_lm/inferences.json') as f:
    data = json.load(f)
with open('transliterate/output/latin_norm/no_lm/alignments.txt', "w+") as l:
    for i in data:
        try:
            wavfile = i['wav_filename'].split('/')[-1]
            compare_tuple = (gt.transliterate(i['src']),