def stem_terminology_words(apps, schema_editor):
    """Create ``Stem`` rows and stem/unit links for translated terminology.

    Selects ``Unit`` rows in the ``TRANSLATED`` state that either belong to a
    project whose code is ``"terminology"`` or live in a store whose name
    starts with ``"pootle-terminology"``. Each unit's ``source_f`` is split
    into words; words longer than two characters that are not site stopwords
    are stemmed, and every resulting stem is linked to the units it came from.

    :param apps: registry exposing ``get_model``; the models are looked up
        through it rather than imported directly.
    :param schema_editor: not used by this function.
    """
    Stem = apps.get_model("pootle_word.Stem")
    Unit = apps.get_model("pootle_store.Unit")
    translated = Unit.objects.filter(state=TRANSLATED)
    units = (
        translated.filter(
            store__translation_project__project__code="terminology")
        | translated.filter(store__name__startswith="pootle-terminology")
    ).values_list("id", "source_f")
    site_stopwords = stopwords.get().words
    # Raw literal: "\W" inside a plain string is an invalid escape sequence.
    delimiters = re.compile(r"[\W]+", re.U)

    # Maps each stem to the set of unit ids whose source text produced it.
    stems = {}
    for unit_id, source in units:
        # The length filter applies to the raw token, before strip/lower.
        words = {
            token.strip().lower()
            for token in delimiters.split(source)
            if len(token) > 2}
        for word in words:
            if not word or word in site_stopwords:
                continue
            stems.setdefault(stem(word), set()).add(unit_id)

    Stem.objects.bulk_create(
        [Stem(root=root) for root in stems], batch_size=1000)
    # Read back the id of every stored stem, keyed by its root.
    added = dict(Stem.objects.values_list("root", "id"))

    m2m = Stem.units.through
    m2m.objects.bulk_create(
        [m2m(stem_id=added[root], unit_id=unit_id)
         for root, unit_ids in stems.items()
         for unit_id in unit_ids],
        batch_size=1000)
def stem_terminology_words(apps, schema_editor):
    """Populate stems, and their links to units, for translated terminology.

    Units are taken from ``pootle_store.Unit`` where the state is
    ``TRANSLATED`` and either the project code is ``"terminology"`` or the
    store name starts with ``"pootle-terminology"``. The source text of each
    unit is tokenised, tokens of one or two characters and site stopwords are
    dropped, and the remaining words are stemmed. One ``Stem`` row is created
    per distinct stem, followed by one through-table row per (stem, unit)
    pair.

    :param apps: registry exposing ``get_model``, used to obtain the models.
    :param schema_editor: not used by this function.
    """
    Stem = apps.get_model("pootle_word.Stem")
    Unit = apps.get_model("pootle_store.Unit")
    units = Unit.objects.filter(state=TRANSLATED)
    units = (
        units.filter(store__translation_project__project__code="terminology")
        | units.filter(store__name__startswith="pootle-terminology"))
    units = units.values_list("id", "source_f")
    site_stopwords = stopwords.get().words
    # The backslash is escaped because "\W" is not a valid string escape;
    # the compiled pattern is unchanged.
    delimiters = re.compile(u"[\\W]+", re.U)

    # stem root -> ids of the units whose source contains a word with it
    stems = {}
    for unit, source in units:
        source_words = set(
            s.strip().lower()
            for s in delimiters.split(source)
            if len(s) > 2)
        for word in source_words:
            if not word or word in site_stopwords:
                continue
            stems.setdefault(stem(word), set()).add(unit)

    Stem.objects.bulk_create(
        [Stem(root=root) for root in stems],
        batch_size=1000)
    # Ids of all stored stems, keyed by root, for building the link rows.
    added = dict(Stem.objects.values_list("root", "id"))

    m2m = Stem.units.through
    m2m_to_add = []
    for root, unit_ids in stems.items():
        stem_id = added[root]
        m2m_to_add.extend(
            m2m(stem_id=stem_id, unit_id=unit_id) for unit_id in unit_ids)
    m2m.objects.bulk_create(m2m_to_add, batch_size=1000)
def test_stopwords():
    """Check ``stopwords.get().words`` against the ``stoplist-en`` file.

    The stoplist is read from a ``share`` directory located either inside the
    ``translate`` package directory or next to it. Only lines starting with
    one of ``<``, ``>``, ``=`` or ``@`` contribute a word: the remainder of
    the line, stripped and lower-cased.
    """
    toolkit_dir = translate.__path__[0]
    if "share" in os.listdir(toolkit_dir):
        share_dir = os.path.join(toolkit_dir, "share")
    else:
        share_dir = os.path.join(toolkit_dir, "..", "share")
    stoplist_path = os.path.join(share_dir, "stoplist-en")

    with open(stoplist_path) as stoplist:
        expected = {
            entry[1:].strip().lower()
            for entry in stoplist.read().split("\n")
            if entry.startswith(("<", ">", "=", "@"))}

    assert stopwords.get().words == expected
def test_stopwords():
    """Verify the stopword set matches the words parsed from ``stoplist-en``.

    The file is looked up under ``share`` in the ``translate`` package
    directory when that directory contains a ``share`` entry, otherwise under
    ``share`` one level above it. A line counts when its first character is
    ``<``, ``>``, ``=`` or ``@``; the rest of the line is stripped and
    lower-cased to give the word.
    """
    toolkit_root = translate.__path__[0]
    has_share = "share" in os.listdir(toolkit_root)
    prefix = (toolkit_root,) if has_share else (toolkit_root, "..")
    stoplist = os.path.join(*(prefix + ("share", "stoplist-en")))

    with open(stoplist) as handle:
        content = handle.read()

    markers = "<>=@"
    expected = set()
    for row in content.split("\n"):
        if row and row[0] in markers:
            expected.add(row[1:].strip().lower())

    registered = stopwords.get().words
    assert registered == expected
def test_unit_terminology_instance(terminology_units, terminology0):
    """Exercise the ``UnitTerminology`` wrapper obtained for a unit.

    ``terminology_units`` is used as the source text of the unit to look up
    in the first store of ``terminology0``. The test compares the wrapper's
    attributes with values recomputed here, then checks that ``stem()``
    rebuilds the unit's stems after they are deleted, and again after the
    unit's source text is changed and then restored.
    """
    units = terminology0.stores.first().units.filter(
        source_f=terminology_units)
    # NOTE(review): the equality is re-checked in Python; presumably the
    # database lookup can match more loosely than ``==`` - confirm.
    unit = next(
        (candidate for candidate in units
         if candidate.source_f == terminology_units),
        None)
    # Fail with a clear message instead of an AttributeError further down.
    assert unit is not None, (
        "no unit with source %r found in the first store of terminology0"
        % (terminology_units,))

    term = terminology.get(unit.__class__)(unit)
    assert isinstance(term, UnitTerminology)
    assert term.context == unit
    assert term.stopwords == stopwords.get().words
    assert term.stemmer == stemmer.get()
    assert term.text == unit.source_f
    # The backslash is escaped because "\w" is not a valid string escape;
    # the pattern itself is unchanged.
    assert (
        term.split(term.text)
        == re.split(u"[^\\w'-]+", term.text))
    assert (
        term.tokens
        == [token.lower()
            for token in term.split(term.text)
            if len(token) > 2 and token.lower() not in term.stopwords])
    assert term.stems == {term.stemmer(token) for token in term.tokens}
    assert term.stem_set == unit.stems
    assert term.stem_model == term.stem_set.model
    assert term.stem_m2m == term.stem_set.through

    # With no stored stems left, stem() is expected to recreate all of them.
    unit.stems.all().delete()
    assert term.existing_stems == set()
    term.stem()
    assert sorted(term.existing_stems) == sorted(term.stems)

    # Changing the source text must be reflected after re-stemming ...
    old_source = unit.source_f
    old_stems = term.existing_stems
    unit.source_f = "hatstand hatstand umbrella"
    unit.save()
    term.stem()
    assert sorted(term.existing_stems) == [u'hatstand', u'umbrella']

    # ... and restoring it must bring back the previous stems.
    unit.source_f = old_source
    unit.save()
    term.stem()
    assert term.existing_stems == old_stems
def stopwords(self):
    """Return the ``words`` of the object provided by ``stopwords.get()``."""
    provider = stopwords.get()
    return provider.words