Example #1
 def __init__(self, lang, use_stemmer, stop_words):
     self.lang = lang
     self.use_stemmer = use_stemmer
     self.word_tokenizer = WordTokenizer(locale=lang)
     # As we currently support only one language, so there is no need to
     # check whether it is supported.
     self.stemmer = Stemmer.Stemmer(self.lang)
     self.stop_words = stop_words
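
The snippets in this section all revolve around polyglot's WordTokenizer/Sequence call chain used in the constructor above. A minimal sketch of that chain, assuming the usual polyglot import paths (polyglot.base.Sequence, polyglot.tokenize.WordTokenizer) and that the required polyglot data is installed:

# Minimal sketch (not part of the original example); the import paths and
# the sample text are assumptions.
from polyglot.base import Sequence
from polyglot.tokenize import WordTokenizer

word_tokenizer = WordTokenizer(locale='en')
tokens = word_tokenizer.transform(Sequence("Hello, world! Tokenize me.")).tokens()
print(tokens)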
Example #2
# Imports inferred from the class body; the polyglot paths are assumptions,
# and compose() comes from elsewhere in the original module (e.g. toolz).
import unicodedata

import Stemmer                                  # PyStemmer
from lxml import html
from lxml.html.clean import clean_html
from polyglot.base import Sequence
from polyglot.tokenize import WordTokenizer


class Preprocess:
    def __init__(self, lang, use_stemmer, stop_words):
        self.lang = lang
        self.use_stemmer = use_stemmer
        self.word_tokenizer = WordTokenizer(locale=lang)
        # As we currently support only one language, so there is no need to
        # check whether it is supported.
        self.stemmer = Stemmer.Stemmer(self.lang)
        self.stop_words = stop_words

    def stem(self, tokens):
        return (self.stemmer.stemWords(tokens)
                if self.use_stemmer
                else tokens)

    def remove_accents(self, text):
        return (unicodedata.normalize('NFD', text)
                .encode('ascii', 'ignore')
                .decode("utf-8"))

    def lower(self, text):
        return text.lower()

    def clear_html(self, text):
        # The HTML cleaner raises an exception when it detects a malformed
        # tag, e.g. a prompt-like symbol such as '<- '.
        try:
            text = clean_html(html.fromstring(text)).text_content()
        except Exception:
            pass
        return text

    def is_token_stopword(self, token):
        return token in self.stop_words

    def is_alpha(self, token):
        return token.isalpha()

    def filtertokens(self, tokens):
        return filter(
            lambda t: self.is_alpha(t) and not (self.is_token_stopword(t)),
            tokens)

    def transform2words(self, text):
        return self.word_tokenizer.transform(Sequence(text)).tokens()

    def process(self, text):
        fnlist = [
            self.remove_accents,
            self.lower,
            self.clear_html,
            self.transform2words,
            self.filtertokens,
            self.stem
        ]
        return compose(*reversed(fnlist))(text)
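
process() depends on a compose() helper that is not shown in the snippet (the original module presumably gets it from a functional utility such as toolz, or defines it locally). A small stand-in that illustrates why fnlist is reversed, assuming the usual right-to-left compose semantics:

# Stand-in compose(): compose(f, g, h)(x) == f(g(h(x))), i.e. the right-most
# function runs first. Reversing fnlist therefore makes process() apply its
# functions in the order they are listed (remove_accents first, stem last).
from functools import reduce

def compose(*fns):
    return reduce(lambda f, g: lambda x: f(g(x)), fns)

pipeline = compose(*reversed([str.strip, str.lower]))
print(pipeline("  Hello WORLD  "))  # strip runs first, then lower -> 'hello world'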
Example #3
import io
import logging
from collections import Counter

# The \p{P} (Unicode punctuation) pattern below is not supported by the
# standard-library re module, so this code needs the third-party regex
# package (or an equivalent definition of re in the original module).
import regex as re

from polyglot.base import Sequence              # assumed import path
from polyglot.tokenize import WordTokenizer


def main(input_fname, output_fname, lang, to_lower=True):
    en_tokenizer = WordTokenizer(locale='en')
    fr_tokenizer = WordTokenizer(locale=lang)

    def tokenizer(text, tokenizer_fn):
        seq = Sequence(text.strip())
        # Materialise the tokens: a lazy filter object has no len(), which
        # the length check below relies on.
        return [w for w in tokenizer_fn.transform(seq) if w != ' ']

    logging.info((lang, "counting pairs"))
    counter = Counter()
    for line_no, line in enumerate(io.open(input_fname, 'r',
                                           encoding='utf-8')):
        if to_lower:
            line = line.lower()
        parts = line.rstrip().split(' ||| ')
        if len(parts) != 4: continue
        source_lang, source_text, target_text, count = parts
        source_tokens = tokenizer(source_text, en_tokenizer)
        target_tokens = tokenizer(target_text, fr_tokenizer)
        if len(source_tokens) > 3 or len(target_tokens) > 3: continue
        count = int(count)
        if count > 1:
            if (re.sub(r'\p{P}', '', source_text[0]) == ''
                    or re.sub(r'\p{P}', '', target_text[0]) == ''
                    or re.sub(r'\p{P}', '', source_text[-1]) == ''
                    or re.sub(r'\p{P}', '', target_text[-1]) == ''):
                continue
            pair = ' ||| '.join([
                source_lang, ' '.join(source_tokens), ' '.join(target_tokens)
            ])
            counter[pair] += count
        if line_no % 100000 == 0:
            logging.info((lang, line_no))

    logging.info((lang, "writing pairs to {0}".format(output_fname)))
    with io.open(output_fname, 'w', encoding='utf-8') as out:
        for pair, count in counter.most_common():
            if count < 10:
                break
            out.write('{0} ||| {1}\n'.format(pair, count))
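
The core of the loop above is the 'lang ||| source ||| target ||| count' line format and the Counter aggregation. A toy illustration with made-up lines, leaving out the polyglot tokenization and the punctuation filtering:

from collections import Counter

lines = [                                   # made-up sample input
    "en ||| thank you ||| merci ||| 12",
    "en ||| thank you ||| merci ||| 30",
    "en ||| good morning ||| bonjour ||| 7",
]
counter = Counter()
for line in lines:
    source_lang, source_text, target_text, count = line.rstrip().split(' ||| ')
    counter[' ||| '.join([source_lang, source_text, target_text])] += int(count)

for pair, count in counter.most_common():
    print('{0} ||| {1}'.format(pair, count))
# en ||| thank you ||| merci ||| 42
# en ||| good morning ||| bonjour ||| 7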
Example #4
def segment(args):
    lang = args.lang
    w_tokenizer = WordTokenizer(locale=lang)
    s_tokenizer = SentenceTokenizer(locale=lang)

    if args.only_sent:
        for l in args.input:
            seq = Sequence(l)
            if not seq.empty(): _print(u'\n'.join(s_tokenizer.transform(seq)))

    elif args.only_word:
        for l in args.input:
            seq = Sequence(l)
            if not seq.empty(): _print(u' '.join(w_tokenizer.transform(seq)))

    else:
        for l in args.input:
            seq = Sequence(l)
            sents = s_tokenizer.transform(seq)
            words = w_tokenizer.transform(seq)
            for tokenized_sent in words.split(sents):
                if not tokenized_sent.empty():
                    _print(u' '.join(tokenized_sent.tokens()))
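
segment() expects an argparse-style namespace and a module-level _print helper, neither of which is part of the snippet. A hypothetical driver meant to sit in the same module as segment(), assuming the polyglot import paths and that the required data is installed; _print and the sample values are made up:

import argparse
from polyglot.base import Sequence                              # assumed path
from polyglot.tokenize import WordTokenizer, SentenceTokenizer  # assumed path

def _print(text):                     # stand-in for the helper segment() uses
    print(text)

args = argparse.Namespace(
    lang='en',
    only_sent=False,
    only_word=True,                   # word-tokenize each input line
    input=["This is one line.  And another sentence."],
)
segment(args)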
Example #5
def segment(args):
  lang  = args.lang
  w_tokenizer = WordTokenizer(locale=lang)
  s_tokenizer = SentenceTokenizer(locale=lang)

  if args.only_sent:
    for l in args.input:
      seq = Sequence(l)
      if not seq.empty(): _print(s_tokenizer.transform(seq))

  elif args.only_word:
    for l in args.input:
      seq = Sequence(l)
      if not seq.empty(): _print(w_tokenizer.transform(seq))

  else:
    for l in args.input:
      seq = Sequence(l)
      sents = s_tokenizer.transform(seq)
      words = w_tokenizer.transform(seq)
      for tokenized_sent in words.split(sents):
        if not tokenized_sent.empty():
          _print(u' '.join(tokenized_sent.tokens()))

Example #6

def main(input_fname, lang, to_lower=True):
    en_tokenizer = WordTokenizer(locale='en')
    fr_tokenizer = WordTokenizer(locale=lang)

    for line_no, line in enumerate(smart_open(input_fname)):
        data = json.loads(line)
        en_text, pairs = data[0][0][1], data[5]
        if not isinstance(pairs, list):
            logging.error((input_fname, 'not list', pairs))
            continue
        for source, _, targets, _, _, _, _ in pairs:
            if not isinstance(targets, list):
                logging.error((input_fname, 'not list', targets))
                continue
            for target in targets:
                if source in en_text:
                    count = int(target[1]) or 1
                    source = tokenizer(source, en_tokenizer, to_lower)
                    target = tokenizer(target[0], fr_tokenizer, to_lower)
                    if source and target:
                        # Python 2-style output; on Python 3 drop the
                        # .encode('utf-8') call.
                        print('{0} ||| {1} ||| {2}'.format(
                            source, target, count).encode('utf-8'))
        if line_no % 10000 == 0:
            logging.info((input_fname, line_no))
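
main() calls a module-level tokenizer(text, tokenizer_fn, to_lower) helper that is not included in the snippet. A plausible stand-in, modelled on the inner tokenizer from Example #3; the original helper may well differ:

from polyglot.base import Sequence      # assumed import path

def tokenizer(text, tokenizer_fn, to_lower=True):
    # Hypothetical reconstruction: optionally lower-case, tokenize with
    # polyglot, and return the tokens joined by single spaces, since main()
    # embeds the result directly in its ' ||| ' output line.
    if to_lower:
        text = text.lower()
    tokens = [w for w in tokenizer_fn.transform(Sequence(text.strip())) if w != ' ']
    return ' '.join(tokens)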

Example #7

 def word_tokenizer(self):
     word_tokenizer = WordTokenizer(locale=self.language.code)
     return word_tokenizer