示例#1
0
def stem_terminology_words(apps, schema_editor):
    """Create ``Stem`` rows and stem/unit links for terminology units.

    Selects translated units whose store belongs to a project with code
    "terminology" or whose store name starts with "pootle-terminology",
    splits each unit's ``source_f`` into lowercase words, drops short words
    and site stopwords, stems the rest and bulk-creates one ``Stem`` per
    distinct root plus one through-table row per (stem, unit) pair.

    The ``(apps, schema_editor)`` signature matches a Django ``RunPython``
    callable -- presumably this runs as a data migration; confirm against
    the caller.

    :param apps: app registry used to look up the ``Stem`` and ``Unit``
        models.
    :param schema_editor: unused here.
    """
    Stem = apps.get_model("pootle_word.Stem")
    Unit = apps.get_model("pootle_store.Unit")
    translated = Unit.objects.filter(state=TRANSLATED)
    units = (
        translated.filter(store__translation_project__project__code="terminology")
        | translated.filter(store__name__startswith="pootle-terminology")
    ).values_list("id", "source_f")
    site_stopwords = stopwords.get().words
    # Raw string: "\W" inside a plain literal is an invalid escape sequence
    # and triggers a warning on modern Python versions.
    delimiters = re.compile(r"[\W]+", re.U)

    # Maps each stemmed root to the set of unit ids whose source contains it.
    stems = {}
    for unit_id, source in units:
        words = {
            token.strip().lower()
            for token in delimiters.split(source)
            if len(token) > 2
        }
        for word in words:
            if not word or word in site_stopwords:
                continue
            stems.setdefault(stem(word), set()).add(unit_id)

    Stem.objects.bulk_create([Stem(root=root) for root in stems], batch_size=1000)
    # bulk_create results are not used for the ids; they are read back by root.
    added = dict(Stem.objects.values_list("root", "id"))
    m2m = Stem.units.through
    m2m_to_add = [
        m2m(stem_id=added[root], unit_id=unit_id)
        for root, unit_ids in stems.items()
        for unit_id in unit_ids
    ]
    m2m.objects.bulk_create(m2m_to_add, batch_size=1000)
示例#2
0
def stem_terminology_words(apps, schema_editor):
    """Populate stems, and their links to units, from terminology sources.

    Translated units are taken from stores that either belong to a project
    whose code is "terminology" or whose name starts with
    "pootle-terminology".  Every ``source_f`` is split on non-word
    characters; pieces longer than two characters are lowercased, filtered
    against the site stopwords and stemmed.  One ``Stem`` is bulk-created
    per distinct root, followed by one through-table row per (stem, unit).

    The ``(apps, schema_editor)`` signature matches a Django ``RunPython``
    callable -- presumably a data migration step; confirm against the
    caller.

    :param apps: app registry used to resolve the ``Stem`` and ``Unit``
        models.
    :param schema_editor: not used by this function.
    """
    Stem = apps.get_model("pootle_word.Stem")
    Unit = apps.get_model("pootle_store.Unit")
    units = Unit.objects.filter(state=TRANSLATED)
    units = (
        units.filter(store__translation_project__project__code="terminology")
        | units.filter(store__name__startswith="pootle-terminology"))
    units = units.values_list("id", "source_f")
    site_stopwords = stopwords.get().words
    # The backslash is escaped explicitly: u"\W" is an invalid escape
    # sequence, and the ``ur`` prefix is not available on Python 3.
    delimiters = re.compile(u"[\\W]+", re.U)
    # root -> set of ids of the units containing a word with that root
    stems = {}
    for unit, source in units:
        source_words = set(
            piece.strip().lower()
            for piece in delimiters.split(source)
            if len(piece) > 2)
        for word in source_words:
            if not word or word in site_stopwords:
                continue
            stems.setdefault(stem(word), set()).add(unit)
    Stem.objects.bulk_create(
        [Stem(root=root) for root in stems],
        batch_size=1000)
    # Ids are looked up again by root rather than taken from bulk_create.
    added = dict(Stem.objects.values_list("root", "id"))
    m2m = Stem.units.through
    m2m_to_add = [
        m2m(stem_id=added[root], unit_id=unit)
        for root in stems
        for unit in stems[root]]
    m2m.objects.bulk_create(m2m_to_add, batch_size=1000)
示例#3
0
def test_stopwords():
    """Check ``stopwords.get().words`` against the ``stoplist-en`` file.

    The stoplist is read from the ``share`` directory inside the
    ``translate`` package when one is listed there, otherwise from the
    ``share`` directory one level above it.  Only lines starting with one of
    ``<``, ``>``, ``=`` or ``@`` contribute a word: the rest of the line,
    stripped and lowercased.
    """
    ttk_path = translate.__path__[0]
    if "share" in os.listdir(ttk_path):
        fpath = os.path.join(ttk_path, "share", "stoplist-en")
    else:
        fpath = os.path.join(ttk_path, "..", "share", "stoplist-en")

    with open(fpath) as f:
        lines = f.read().split("\n")

    expected = {
        line[1:].strip().lower()
        for line in lines
        if line and line[0] in "<>=@"
    }
    assert stopwords.get().words == expected
示例#4
0
def test_stopwords():
    """Compare the stopwords utility with a direct parse of ``stoplist-en``.

    ``stoplist-en`` is looked up under ``<translate>/share`` if the package
    directory lists a ``share`` entry, and under ``<translate>/../share``
    otherwise.  A line counts only when its first character is one of
    ``<>=@``; the remainder of the line is stripped and lowercased.
    """
    ttk_path = translate.__path__[0]
    has_local_share = "share" in os.listdir(ttk_path)
    if has_local_share:
        share_parts = (ttk_path, "share", "stoplist-en")
    else:
        share_parts = (ttk_path, "..", "share", "stoplist-en")
    fpath = os.path.join(*share_parts)

    expected_words = set()
    with open(fpath) as f:
        for line in f.read().split("\n"):
            # Skip blank lines and lines without a recognised marker.
            if not line or line[0] not in "<>=@":
                continue
            expected_words.add(line[1:].strip().lower())

    assert stopwords.get().words == expected_words
示例#5
0
文件: utils.py 项目: yiyibooks/pootle
def test_unit_terminology_instance(terminology_units, terminology0):
    """Exercise the terminology adapter obtained for a single unit.

    Looks up a unit in the first store of ``terminology0`` whose
    ``source_f`` equals ``terminology_units``, wraps it through
    ``terminology.get`` and checks the adapter's attributes, tokenisation
    and stemming.  It then verifies that ``term.stem()`` rebuilds the
    stored stems after they are deleted and after the source is changed and
    restored.
    """
    units = terminology0.stores.first().units.filter(
        source_f=terminology_units)
    # The queryset filter is re-checked in Python with an exact comparison.
    unit = next(
        (candidate for candidate in units
         if candidate.source_f == terminology_units),
        None)
    # Fail with a readable message rather than an AttributeError on None.
    assert unit is not None, (
        "no unit found with source %r" % (terminology_units,))
    term = terminology.get(unit.__class__)(unit)
    assert isinstance(term, UnitTerminology)
    assert term.context == unit
    assert term.stopwords == stopwords.get().words
    assert term.stemmer == stemmer.get()
    assert term.text == unit.source_f
    # Backslash escaped explicitly: u"\w" is an invalid escape sequence.
    assert (
        term.split(term.text)
        == re.split(u"[^\\w'-]+", term.text))
    assert (
        term.tokens
        == [part.lower()
            for part
            in term.split(term.text)
            if (len(part) > 2
                and part.lower() not in term.stopwords)])
    assert (
        term.stems
        == set(term.stemmer(token) for token in term.tokens))
    assert term.stem_set == unit.stems
    assert term.stem_model == term.stem_set.model
    assert term.stem_m2m == term.stem_set.through

    # With the stored stems removed, stem() must recreate them.
    unit.stems.all().delete()
    assert term.existing_stems == set()
    term.stem()
    assert sorted(term.existing_stems) == sorted(term.stems)

    # Changing the source must replace the stored stems...
    old_source = unit.source_f
    old_stems = term.existing_stems
    unit.source_f = "hatstand hatstand umbrella"
    unit.save()
    term.stem()
    assert (
        sorted(term.existing_stems)
        == [u'hatstand', u'umbrella'])

    # ...and restoring it must bring the earlier stems back.
    unit.source_f = old_source
    unit.save()
    term.stem()
    assert (
        term.existing_stems
        == old_stems)
示例#6
0
 def stopwords(self):
     """Return the ``words`` attribute of the object from ``stopwords.get()``.

     Inside this body ``stopwords`` is looked up as a global name, not as
     this method.
     """
     provider = stopwords.get()
     return provider.words
示例#7
0
文件: utils.py 项目: rmoch/pootle
 def stopwords(self):
     """Fetch ``stopwords.get()`` and hand back its ``words`` attribute.

     The ``stopwords`` name used in the body resolves to a global, not to
     this method.
     """
     site_stopwords = stopwords.get()
     return site_stopwords.words