Пример #1
0
  def __init__(self, toupdate=True):
    """
    Load the WALS languoid table and build genus/family lookup tables.

    Every data row is stored on this object as ``self[lang] = {column: value}``
    (the object is used as a mapping), where ``lang`` is the first
    whitespace-delimited token of the row and the columns are the
    tab-separated header fields after the first one.

    Three indexes are then derived from the stored rows:

    - ``self.GENUS``: genus -> list of language keys.
    - ``self.LANGUAGEFAMILY``: family -> list of language keys.
    - ``self.RELATED_LANGS``: language key -> the keys sharing its genus
      followed by the keys sharing its family. The language itself is part
      of both lists, so it appears more than once.

    :param toupdate: forwarded to ``sync_and_read``; presumably controls
      whether the local copy is refreshed from the URL -- TODO confirm.
    """
    WALS_URL = ("http://wals.info/languoid.tab?sEcho=1&iSortingCols=1"
                "&iSortCol_0=0&sSortDir_0=asc")
    WALS_TXT = currentdirectory()+"/data/wals/wals.txt"

    wals_tsv = sync_and_read(WALS_URL, WALS_TXT, toupdate=toupdate)
    headerline, _, data = wals_tsv.partition('\n')
    # The header does not change, so split it once instead of once per row.
    columns = headerline.split('\t')[1:]

    for line in data.split('\n'):
      # A trailing newline (or any blank row) produces an empty line, for
      # which line.split()[0] would raise IndexError.
      if not line.strip():
        continue
      lang = line.split()[0]
      for key, value in zip(columns, line.split('\t')[1:]):
        self.setdefault(lang, {})[key] = value

    self.GENUS = defaultdict(list)
    self.LANGUAGEFAMILY = defaultdict(list)
    for lang in self:
      self.GENUS[self[lang]['genus']].append(lang)
      self.LANGUAGEFAMILY[self[lang]['family']].append(lang)

    self.RELATED_LANGS = defaultdict(list)
    for lang in self:
      self.RELATED_LANGS[lang] = self.GENUS[self[lang]['genus']] + \
                                self.LANGUAGEFAMILY[self[lang]['family']]
Пример #2
0
def languages():
  """
  Returns the list of language codes found in the original ODIN tarball.

  A member contributes a code when the part of its name before the first
  '.' is exactly three characters long. A few non-standard codes are
  replaced through the ``conversions`` table; all others are kept as is.
  """
  conversions = {"JPN":"jpn", "MAC":"mkd", "qgk":"grc"}
  langs = []
  # Open the archive as a context manager so the file handle is released.
  with tarfile.open(currentdirectory()+'/data/odin/odin-full.tar') as tar:
    for member in tar:
      lang = str(member.name).partition('.')[0]
      if len(lang) != 3:
        continue
      langs.append(conversions.get(lang, lang))
  return langs
Пример #3
0
def source_sents(intarfile=currentdirectory()+'/data/odin/odin-all.tar'):
  """
  Generate (language, sentence) pairs from the ODIN tarball.

  Files returned by ``read_tarfile`` are visited in sorted order. The
  language is the second '-'-separated field of the file name, cut at the
  first '.' and then at the first '_', and mapped through ``conversions``.
  The sentence is the first tab-separated field of each stripped line.
  """
  conversions = {"JPN":"jpn", "MAC":"mkd", "qgk":"grc"}
  for path in sorted(read_tarfile(intarfile)):
    filename = path.split('/')[-1]
    code = filename.split('-')[1].split('.')[0].split('_')[0]
    code = conversions.get(code, code)
    with codecs.open(path,'r','utf8') as fin:
      for row in fin.readlines():
        fields = row.strip().split('\t')
        yield code, fields[0]
Пример #4
0
def phrases(intarfile=currentdirectory()+'/data/omniglot/omniglotphrases.tar', \
            onlysource=False):
  """
  Yield source and translation sentences from the clean Omniglot tarball.

  The language is the second '-'-separated field of each file name, cut at
  the first '.' and then at the first '_'. Each non-blank line is split at
  its first tab into a sentence and a translation.

  :param intarfile: path of the tarball handed to ``read_tarfile``.
  :param onlysource: when true, yield ``(language, sentence)`` pairs;
    otherwise yield ``(language, sentence, translation)`` triples.

  Blank lines are skipped. A line without a tab yields an empty
  translation instead of raising ValueError.
  """
  for infile in read_tarfile(intarfile):
    language = infile.split('/')[-1].split('-')[1].split('.')[0].split('_')[0]
    with codecs.open(infile,'r','utf8') as fin:
      for line in fin.readlines():
        line = line.strip()
        if not line:
          continue
        # partition() never fails: strip() above removes a trailing tab, so
        # a row with an empty translation has no separator left at all.
        sentence, _, translation = line.partition('\t')
        if onlysource:
          yield language, sentence.strip()
        else:
          yield language, sentence, translation
Пример #5
0
def documents(intarfile=currentdirectory()+'/data/udhr/udhr-unicode.tar', \
              bysentence=False):
  """
  Yields UDHR by documents, or line by line when ``bysentence`` is set.

  The language is the second '-'-separated field of each file name, cut at
  the first '.' and then at the first '_'.

  :param intarfile: path of the tarball handed to ``read_tarfile``.
  :param bysentence: when true, yield ``(language, line)`` for every
    non-blank line with surrounding whitespace removed; otherwise yield
    ``(language, text)`` once per file with the whole file content.
  """
  for infile in read_tarfile(intarfile):
    language = infile.split('/')[-1].split('-')[1].split('.')[0].split('_')[0]
    with codecs.open(infile,'r','utf8') as fin:
      if bysentence:
        for line in fin.readlines():
          # Strip before testing: a raw line still carries its newline and
          # is therefore never empty, so blank lines used to slip through.
          sentence = line.strip()
          if sentence:
            yield language, sentence
      else:
        yield language, fin.read()
Пример #6
0
def load_odin_pickle(ODIN_PICKLE=currentdirectory()+'/data/odin/odin-docs.pk'):
  """
  Loads odin-docs.pk and yields one language's IGTs at a time.

  If the pickle file does not exist yet, it is first created from the
  result of ``get_odin_igts()``.

  Usage::

      for lang, igts in load_odin_pickle():
        for igt in igts:
          print lang, igt

  :param ODIN_PICKLE: path of the pickle file to read (and create if absent).

  Note that ``pickle.load`` reads the whole mapping into memory; yielding
  only avoids building a second container for the caller.
  """
  # If odin-docs.pk is not available create it.
  if not os.path.exists(ODIN_PICKLE):
    odindocs = get_odin_igts()
    with open(ODIN_PICKLE,'wb') as fout:
      pickle.dump(odindocs, fout)

  # NOTE(review): unpickling can execute arbitrary code; this is only safe
  # as long as the file is the locally generated cache written above.
  with open(ODIN_PICKLE,'rb') as fin:
    docs = pickle.load(fin)

  # Iterate after the file is closed so the handle is not held open for as
  # long as the caller keeps the generator around.
  for lang in docs:
    yield (lang, docs[lang])
Пример #7
0
def _strip_citation_markup(text, marker='&lt;/p&gt;'):
  """ Trims whitespace and whole ``marker`` occurrences from both ends. """
  # str.strip(chars) treats its argument as a set of characters, so
  # strip(' &lt;/p&gt;') also removed leading/trailing letters such as
  # 'l', 't', 'p' and 'g' that belong to the citation text itself.
  text = text.strip()
  while text.startswith(marker):
    text = text[len(marker):].strip()
  while text.endswith(marker):
    text = text[:-len(marker)].strip()
  return text


def get_odin_igts(ODINFILE=currentdirectory()+'/data/odin/odin-full.tar'):
  """
  Extracts the examples from the ODIN igts and returns a defaultdict(list),
  where the keys are the lang iso codes and values are the examples.

  Each value is a list of ``(src, eng, gloss, cite)`` tuples. The key is the
  lower-cased member name without its last four characters.

  Usage::

      igts = get_odin_igts()
      for lang in igts:
        for igt in igts[lang]:
          print lang, igt

  :param ODINFILE: path of the ODIN tarball.
  :raises ValueError: when an <example> does not contain exactly three
    <line> elements (the unpacking below fails).
  """
  docs = defaultdict(list)
  # The context manager closes the archive even if parsing fails.
  with tarfile.open(ODINFILE) as tar:
    for infile in tar:
      # there's a rogue file in the tar that is not xml.
      if '.xml' not in infile.name:
        continue
      lang = infile.name[:-4].lower()
      # Parse the document once and query it for both tag kinds.
      soup = bs(tar.extractfile(infile).read())
      igts = soup.findAll('igt')
      citations = soup.findAll('citation')
      # NOTE(review): zip() pairs igts and citations by position and stops
      # at the shorter list -- confirm they are always one-to-one.
      for igt, cite in zip(igts, citations):
        # Find the <example>...</example> in the igt.
        examples = bs(unicode(igt)).findAll('example')
        # Presumably an escaped closing paragraph tag is left over once the
        # real tags are removed -- TODO confirm against the data.
        cite = _strip_citation_markup(remove_tags(unicode(cite)))
        for eg in examples:
          # Only use triplets lines and assumes that
          # line1: src, line2:eng, line3:gloss
          src, eng, gloss = bs(unicode(eg)).findAll('line')
          src, eng, gloss, cite = map(unicode, [src, eng, gloss, cite])
          docs[lang].append((src, eng, gloss, cite))
  return docs
Пример #8
0
def languages():
  """
  Returns the list of languages available from original data source.

  For every member with a non-empty name, the language is the text between
  the first '-' of the name and the first '.' that follows it.
  """
  tarpath = currentdirectory()+'/data/omniglot/omniglotphrases.tar'
  # Open the archive as a context manager so the file handle is released.
  with tarfile.open(tarpath) as tar:
    return [str(member.name).partition('-')[2].partition('.')[0]
            for member in tar if member.name != ""]
Пример #9
0
def source_sents(intarfile=currentdirectory()+\
                 '/data/omniglot/omniglotphrases.tar', onlysource=True):
  """
  Return the generator produced by ``phrases`` for the Omniglot tarball.

  Both arguments are handed to ``phrases`` unchanged; ``onlysource``
  defaults to True here.
  """
  return phrases(intarfile=intarfile, onlysource=onlysource)
Пример #10
0
# -*- coding: utf-8 -*-

import codecs, re
from collections import defaultdict
from utils import sync_and_read, currentdirectory

# Link to the ISO 639-3 file.
ISO6393_URL = "http://www-01.sil.org/iso639-3/iso-639-3.tab"
ISO6393_TXT = currentdirectory()+"/data/sil/iso6393.txt" # a local copy.
# Link to the ISO 639-3 names file.
ISO6393_NAME_URL = "http://www-01.sil.org/iso639-3/iso-639-3_Name_Index.tab"
ISO6393_NAME_TXT = currentdirectory()+"/data/sil/iso6393-name.txt" # a local copy.
# Readable labels for the single-letter scope and type codes.
# Scope of language, see http://www-01.sil.org/iso639-3/scope.asp
# Type of language, see http://www-01.sil.org/iso639-3/types.asp
# See http://www-01.sil.org/iso639-3/iso-639-3.tab
scopetype = {"I":"Individual", "M":"Macrolanguage",
             "L":"Living", "E":"Extinct", "A":"Ancient",
             "H":"Historic", "C":"Constructed"}
# Link to ISO 639-3 Macrolanguages file.
MACROLANGS_URL = "http://www-01.sil.org/iso639-3/iso-639-3-macrolanguages.tab"
# NOTE: the file name is spelled "marcolangs" on purpose here; it names the
# existing local copy, so it is left unchanged.
MACROLANGS_TXT = currentdirectory()+"/data/sil/marcolangs.txt" # a local copy.

# Link to the ISO 639-3 retirement file.
RETIRED_URL = "http://www-01.sil.org/iso639-3/iso-639-3_Retirements.tab"
RETIRED_TXT = currentdirectory()+"/data/sil/retired.txt" # a local copy.

class MiniSIL:
  """
  Holder for ISO 639-3 data.

  ``ISO6393`` starts as an empty dict and ``MARCOLANGS`` as an empty
  ``defaultdict(list)``; both are expected to be filled by ``update``.
  """
  def __init__(self, toupdate=True):
    """
    Create the empty containers, then call ``self.update(toupdate)``.

    NOTE(review): ``update`` is not defined in the visible part of the
    class -- confirm what ``toupdate`` controls there.
    """
    self.ISO6393 = {}
    self.MARCOLANGS = defaultdict(list)
    self.update(toupdate)
Пример #11
0
def languages():
  """
  Returns a list of available languages from original data source.

  For every name produced by ``enumerate_udhr`` the language is the text
  between the first '-' and the first '.' that follows it.
  """
  tarpath = currentdirectory()+'/data/udhr/udhr-unicode.tar'
  langs = []
  for filename in enumerate_udhr(intarfile=tarpath):
    after_dash = filename.partition('-')[2]
    langs.append(after_dash.partition('.')[0])
  return langs
Пример #12
0
def source_sents(intarfile=currentdirectory()+'/data/udhr/udhr-unicode.tar',
                 bysentence=True):
  """
  Return whatever ``sents`` produces for the UDHR tarball.

  Both arguments are passed on positionally and unchanged.

  NOTE(review): ``sents`` is not defined in the visible part of the file --
  confirm it exists and accepts ``(intarfile, bysentence)``.
  """
  udhr_sents = sents(intarfile, bysentence)
  return udhr_sents