def getSalientSets(self, lang, mxSetSize=1, AFreqP=0.20, OFreqP=0.0625):
        """Collect word sets from this article that are salient for the
        language, i.e. rarer than both language-wide frequency cut-offs.

        lang      -- unused; the language is read from
                     self.language_model.lang (parameter kept for
                     interface compatibility)
        mxSetSize -- maximum set size forwarded to self.genWordSets
        AFreqP    -- article-count cut-off as a fraction of the language's
                     total article count
        OFreqP    -- occurrence cut-off as a fraction of the language's
                     maximum word frequency
        Returns [salient, self] where salient is a list of
        [word, articleSaliencyScore] pairs.
        """
        # NOTE(review): the previous version tokenized self.text into a
        # local 'words' list (plus 'sets'/'setlen') that was never used
        # afterwards; that dead code has been removed.
        l = LanguageInfoModel_Mongo()
        linfo = l.getLanguage(self.language_model.lang)

        # Absolute cut-offs, scaled from the language-wide statistics.
        AFreq = linfo["articleCount"] * AFreqP
        OFreq = linfo["maxFreq"] * OFreqP

        ret = []
        for w in self.genWordSets(mxSetSize):
            wdata = self.language_model.getWord(w[0])
            if wdata is not None:  # identity check instead of '!= None'
                wAFreq = wdata.articleCount()
                wOFreq = wdata.getFreq()
                # Rarer within this article => higher saliency score.
                articleSaliencyScore = 1 - float(self.words[w[0]].freq) / float(len(self.words.keys()))
                if wOFreq < OFreq and wAFreq < AFreq:
                    ret.append([w[0], articleSaliencyScore])

        return [ret, self]
def generate_model(lang, sites, mxParse=-1, mxSetSize=3):
    model = LanguageModel(lang)
    mongo = LanguageModel_Mongo("", lang, None)
    parsed = 0

    articleDB = ArticleDB()
    while (parsed < mxParse or (mxParse == -1 and parsed < articleDB.count())):
        a = articleDB.get(index=parsed)
        txt = ""  #' '.join(a.get('text',''))
        adate = ' '.join(a.get('time', ''))
        url = ""  #''.join(a.get('url',''))
        atitle = ""

        if isinstance(a.get('url', []), list):
            url = ' '.join(a.get('url', ''))
        elif isinstance(a.get('url', ""), basestring):
            url = a.get('url', "")
        if isinstance(a.get('text', []), list):
            txt = ' '.join(a.get('text', ''))
        elif isinstance(a.get('text', ""), basestring):
            txt = a.get('text', "")
        if isinstance(a.get('title', []), list):
            atitle = ' '.join(a.get('title', ''))
        elif isinstance(a.get('title', ""), basestring):
            atitle = a.get('title', "")
        for s in sites:
            if s in url:
                a = Article(text=txt,
                            title=atitle,
                            src=url,
                            date=adate,
                            nid=a['_id'],
                            language_model=model)
                a.analyze(mxSetSize)

        parsed += 1

    print "Parsed ", parsed, " Articles. Inserting into Database"
    mongo.collection.drop()
    for k, w in model.words.iteritems():
        mongo.__process_word__(w)

    #Update Language Info
    langInfo = LanguageInfoModel_Mongo()

    keys = sorted(model.words.keys())
    freq = model.getWordsByFrequency()

    langInfo.updateLanguage(lang, parsed, len(model.words.keys()),
                            sorted(freq.keys())[len(freq) - 1], sites)

    return mongo
def generate_model(lang, sites, mxParse=-1, mxSetSize=3):
    model = LanguageModel(lang)
    mongo = LanguageModel_Mongo("", lang, None)
    parsed = 0

    articleDB = ArticleDB()
    while (parsed < mxParse or (mxParse == -1 and parsed < articleDB.count())):
        a = articleDB.get(index=parsed)
        txt = ""#' '.join(a.get('text',''))
        adate = ' '.join(a.get('time',''))
        url = ""#''.join(a.get('url',''))
        atitle = ""

        if isinstance(a.get('url', []), list):
            url = ' '.join(a.get('url',''))
        elif isinstance(a.get('url', ""), basestring):
            url = a.get('url', "")
        if isinstance(a.get('text', []), list):
            txt = ' '.join(a.get('text',''))
        elif isinstance(a.get('text', ""), basestring):
            txt = a.get('text', "")
        if isinstance(a.get('title', []), list):
            atitle = ' '.join(a.get('title',''))
        elif isinstance(a.get('title', ""), basestring):
            atitle = a.get('title', "")
        for s in sites:
            if s in url:
                a = Article(text=txt, title=atitle, src=url, date=adate, nid=a['_id'], language_model=model)
                a.analyze(mxSetSize)


        parsed += 1

    print "Parsed ", parsed, " Articles. Inserting into Database"
    mongo.collection.drop()
    for k, w in model.words.iteritems():
        mongo.__process_word__(w)

    #Update Language Info
    langInfo = LanguageInfoModel_Mongo()

    keys = sorted(model.words.keys())
    freq = model.getWordsByFrequency()

    langInfo.updateLanguage(lang, parsed, len(model.words.keys()), sorted(freq.keys())[len(freq)-1], sites)

    return mongo