Example #1
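A SAX-style startElement handler that rebuilds a Doclist from XML: it allocates a fresh Doclist, Doc, or Lang as the corresponding element opens, and on a token element resets the accumulated text and reads the optional count, morphs, and prons attributes, falling back to defaults when an attribute is absent.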
def startElement(self, name, attributes):
    # Allocate a fresh container for each opening element; on <token>,
    # reset the text accumulator and read the optional attributes,
    # falling back to defaults when one is missing.
    if name == 'doclist':
        self.in_token_ = False
        self.doclist_ = documents.Doclist()
    elif name == 'doc':
        self.in_token_ = False
        self.doc_ = documents.Doc()
    elif name == 'lang':
        self.in_token_ = False
        self.lang_ = tokens.Lang()
        try:
            self.lang_.SetId(attributes['id'])
        except KeyError:
            pass
    elif name == 'token':
        self.token_string_ = ''
        self.in_token_ = True
        try:
            self.count_ = int(attributes['count'])
        except KeyError:
            self.count_ = 1
        try:
            self.morphs_ = attributes['morphs']
        except KeyError:
            self.morphs_ = ''
        try:
            self.prons_ = attributes['prons']
        except KeyError:
            self.prons_ = ''
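The handler above is evidently one method of an xml.sax ContentHandler subclass. Below is a minimal sketch of how such a handler is typically driven; the DoclistHandler class, its skeleton methods, and the input file name are illustrative, not from the source.

import xml.sax

class DoclistHandler(xml.sax.handler.ContentHandler):
    # Skeleton wiring only; a real handler would use the full
    # startElement from Example #1 and build the Doclist objects.
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self.in_token_ = False
        self.token_string_ = ''

    def startElement(self, name, attributes):
        if name == 'token':
            self.in_token_ = True
            self.token_string_ = ''
            # attributes.get() is equivalent to the try/except
            # KeyError pattern used in Example #1.
            self.count_ = int(attributes.get('count', 1))

    def characters(self, data):
        # Token text arrives here, possibly in several chunks.
        if self.in_token_:
            self.token_string_ += data

    def endElement(self, name):
        if name == 'token':
            self.in_token_ = False

xml.sax.parse('doclist.xml', DoclistHandler())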
Example #2
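Loads a parallel Thai/English data set into a single Doc: each language gets its own Lang, tokens come from the language-specific extractor, duplicates are merged with CompactTokens(), and every token is pronounced with the matching pronouncer (UnitranPronouncer for Thai, EnglishPronouncer for English). THAI_ and ENGLISH_ are presumably module-level input paths defined elsewhere.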
def LoadData():
    t_extr = thai_extractor.ThaiExtractor()
    e_extr = extractor.NameExtractor()
    doclist = documents.Doclist()
    doc = documents.Doc()
    doclist.AddDoc(doc)
    #### Thai
    lang = tokens.Lang()
    lang.SetId('th')
    doc.AddLang(lang)
    t_extr.FileExtract(THAI_)
    lang.SetTokens(t_extr.Tokens())
    lang.CompactTokens()
    for t in lang.Tokens():
        pronouncer_ = pronouncer.UnitranPronouncer(t)
        pronouncer_.Pronounce()
    #### English
    lang = tokens.Lang()
    lang.SetId('en')
    doc.AddLang(lang)
    e_extr.FileExtract(ENGLISH_)
    lang.SetTokens(e_extr.Tokens())
    lang.CompactTokens()
    for t in lang.Tokens():
        pronouncer_ = pronouncer.EnglishPronouncer(t)
        pronouncer_.Pronounce()
    return doclist
Example #3
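The same loader as Example #2, adapted to Chinese: ChineseExtractor supplies the tokens and HanziPronouncer produces the pronunciations.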
def LoadData():
    t_extr = chinese_extractor.ChineseExtractor()
    e_extr = extractor.NameExtractor()
    doclist = documents.Doclist()
    doc = documents.Doc()
    doclist.AddDoc(doc)
    #### Chinese
    lang = tokens.Lang()
    lang.SetId('zh')
    doc.AddLang(lang)
    t_extr.FileExtract(CHINESE_)
    lang.SetTokens(t_extr.Tokens())
    lang.CompactTokens()
    for t in lang.Tokens():
        pronouncer_ = pronouncer.HanziPronouncer(t)
        pronouncer_.Pronounce()
    #### English
    lang = tokens.Lang()
    lang.SetId('en')
    doc.AddLang(lang)
    e_extr.FileExtract(ENGLISH_)
    lang.SetTokens(e_extr.Tokens())
    lang.CompactTokens()
    for t in lang.Tokens():
        pronouncer_ = pronouncer.EnglishPronouncer(t)
        pronouncer_.Pronounce()
    return doclist
Example #4
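A constructor that either wraps a caller-supplied Doclist or creates an empty one, then caches the document count in n_; tokstats_ presumably accumulates per-token statistics later.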
def __init__(self, doclist=None):
    if doclist is None:
        self.doclist_ = documents.Doclist()
    else:
        self.doclist_ = doclist
    self.n_ = len(self.doclist_.Docs())
    self.tokstats_ = {}
Example #5
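A batch loader driven by a file list. Each line of the list names a group number, a language code, and one or more input files; a change of group starts a new Doc (with a sanity check that groups are consecutive), and a change of language starts a new Lang. After extraction, rare terms can be dropped with a FrequencyFilter and the result optionally dumped to XML. The expected line format is sketched after the function.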
def LoadData(filelist,
             base='.',
             extractor_=extractor.NameExtractor,
             xdump=None,
             mincnt=DEF_MINCNT_):
    lastgroup = -1
    lastlanguage = ''
    doc = None
    lang = None
    doclist = documents.Doclist()
    xtractr = extractor_()
    sys.stderr.write('Extracting terms...\n')
    fp = open(filelist)
    for line in fp:
        toks = line.split()
        group = int(toks[0])
        language = toks[1]
        files = toks[2:]
        if group != lastgroup:
            if lastgroup > 0:
                assert group == lastgroup + 1,\
                    'Failed sanity check: group %d != group %d + 1' % (group, lastgroup)
            doc = documents.Doc()
            doclist.AddDoc(doc)
        if language != lastlanguage:
            if lang:
                lang.CompactTokens()
            lang = tokens.Lang()
            lang.SetId(language)
            doc.AddLang(lang)
        for file in files:
            file = base + '/' + file
            xtractr.InitData()
            xtractr.FileExtract(file)
            for t in xtractr.Tokens():
                lang.AddToken(t)
        lastgroup = group
        lastlanguage = language
    fp.close()
    if mincnt > 0:
        sys.stderr.write(
            'Filtering to remove terms less frequent than %d...\n' % mincnt)
        filter_ = filter.FrequencyFilter(doclist)
        filter_.SetMinCount(mincnt)
        filter_.Filter()
    if xdump:
        sys.stderr.write('Dumping doclist to %s...\n' % xdump)
        doclist.XmlDump(xdump, utf8=True)
    return doclist
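From the parsing loop, each line of filelist is whitespace-separated: an integer group number, a language code, then one or more file paths relative to base. An illustrative three-line list (the values are invented):

0 th corpus/th_0001.txt corpus/th_0002.txt
0 en corpus/en_0001.txt
1 th corpus/th_0003.txt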
Example #6
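Builds one Doc per aligned Chinese/English line pair, skipping pairs whose confidence score (read from a parallel file) falls below MINCONFIDENCE_. English tokens that the English pronouncer leaves without a pronunciation fall back to a LatinPronouncer.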
def LoadData():
  mp = open(CHINESE_)
  ep = open(ENGLISH_)
  cp = open(CONFIDENCE_)
  doclist = documents.Doclist()
  while True:
    eline = ep.readline()
    mline = mp.readline()
    cline = cp.readline()
    if not cline: break
    if float(cline.strip()) < MINCONFIDENCE_: continue
    doc = documents.Doc()
    ### Chinese
    extractor_ = chinese_extractor.ChineseExtractor()
    extractor_.InitData()
    extractor_.LineSegment(mline)
    lang = tokens.Lang()
    lang.SetId('zho')
    for t in extractor_.Tokens():
      lang.AddToken(t)
    lang.CompactTokens() ## Combine duplicates
    for t in lang.Tokens():
      pronouncer_ = pronouncer.HanziPronouncer(t)
      pronouncer_.Pronounce()
    doc.AddLang(lang)
    ### English
    extractor_ = extractor.NameExtractor()
    extractor_.InitData()
    extractor_.LineSegment(eline)
    lang = tokens.Lang()
    lang.SetId('eng')
    for t in extractor_.Tokens():
      lang.AddToken(t)
    lang.CompactTokens() ## Combine duplicates
    for t in lang.Tokens():
      pronouncer_ = pronouncer.EnglishPronouncer(t)
      pronouncer_.Pronounce()
      if not t.Pronunciations():
        pronouncer_ = pronouncer.LatinPronouncer(t)
        pronouncer_.Pronounce()
    doc.AddLang(lang)
    doclist.AddDoc(doc)
  mp.close()
  ep.close()
  cp.close()
  return doclist
Example #7
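Constructs a small Doclist fixture by hand, presumably for testing: two Docs covering English, Chinese, and Arabic tokens with explicit counts, pronunciations, and morphological decompositions. The duplicate 'Bush' entries in the first Lang are merged by CompactTokens().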
def CreateDoclist():
    doclist = documents.Doclist()
    doc = documents.Doc()
    lang = tokens.Lang()
    lang.SetId('eng')
    token_ = tokens.Token('Bush')
    token_.SetCount(1)
    token_.AddPronunciation('b U S')
    token_.SetMorphs(['Bush', "'s"])
    lang.AddToken(token_)
    token_ = tokens.Token('Clinton')
    token_.SetCount(3)
    token_.AddPronunciation('k l I n t & n')
    token_.AddPronunciation('k l I n t > n')
    token_.SetMorphs(['Clinton'])
    lang.AddToken(token_)
    token_ = tokens.Token('Bush')
    token_.SetCount(3)
    token_.AddPronunciation('b U S')
    token_.SetMorphs([
        'Bush',
        "'s",
    ])
    lang.AddToken(token_)
    lang.CompactTokens()
    doc.AddLang(lang)
    lang = tokens.Lang()
    lang.SetId('zho')
    token_ = tokens.Token('克林頓')
    token_.SetCount(3)
    token_.AddPronunciation('kh & l i n t u n')
    token_.SetMorphs(['克林頓'])
    lang.AddToken(token_)
    token_ = tokens.Token('高島屋')
    token_.SetCount(1)
    token_.AddPronunciation('k a u t a u u')
    token_.AddPronunciation('t A k A s i m A j a')
    lang.AddToken(token_)
    doc.AddLang(lang)
    doclist.AddDoc(doc)
    doc = documents.Doc()
    lang = tokens.Lang()
    lang.SetId('eng')
    token_ = tokens.Token('Clinton')
    token_.SetCount(2)
    token_.AddPronunciation('k l I n t & n')
    token_.SetMorphs(['Clinton'])
    lang.AddToken(token_)
    token_ = tokens.Token('Bush')
    token_.SetCount(3)
    token_.AddPronunciation('b U S')
    token_.SetMorphs(['Bush', "'s"])
    lang.AddToken(token_)
    doc.AddLang(lang)
    lang = tokens.Lang()
    lang.SetId('ara')
    token_ = tokens.Token('كلينتون')
    token_.SetCount(3)
    token_.AddPronunciation('k l j n t w n')
    token_.SetMorphs(['كلينتون'])
    lang.AddToken(token_)
    doc.AddLang(lang)
    doclist.AddDoc(doc)
    return doclist
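A minimal sketch of consuming the fixture above, using only calls that already appear in these examples (the output file name is illustrative):

import sys

doclist = CreateDoclist()
sys.stderr.write('%d docs\n' % len(doclist.Docs()))  # Docs() as in Example #4
doclist.XmlDump('doclist.xml', utf8=True)            # XmlDump as in Example #5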