def bbc_parser(doc, category):
  """Parse a BBC article into its normalized components.

  The first line of ``doc`` is taken as the title; remaining non-empty
  lines are paragraphs. Parsing is bounded by a 15-second SIGALRM so a
  pathological article cannot hang the pipeline.

  Args:
    doc: Full article text, title on the first line.
    category: Unused here; kept for interface compatibility with callers.
      -- TODO confirm whether any caller relies on it.

  Returns:
    A dict with keys ``id``, ``title``, ``sents``, ``paras``, ``pos``,
    ``nouns`` and ``ners``, or ``None`` if processing failed (best-effort:
    any exception, including the alarm's timeout, is swallowed).
  """
  try:
    # Abort runaway parses: semantics()/tokenization can hang on bad input.
    signal.alarm(15)
    lines = doc.split("\n")
    title = lines[0]
    _id = normalize_title(title)

    tn = TextNormalizer()
    pos, nouns, ners = semantics(doc)
    nouns = tn.fmap(nouns)
    ners = tn.fmap(ners)

    paragraphs = []
    sentences = []
    for line in lines[1:]:
      if not line:
        continue  # skip blank separator lines
      sentences += tn.fmap(sentence_tokenize(line))
      paragraphs.append(tn.normalize(line))

    return {
      "id" : _id,
      "title" : title,
      "sents" : sentences,
      "paras" : paragraphs,
      "pos" : pos,
      "nouns" : nouns,
      "ners" : ners
    }
  except Exception:
    # Best-effort pipeline: log and move on rather than abort the batch.
    print("Could not process article")
    return None
  finally:
    # Always disarm the alarm, on success, failure, or timeout — the
    # original reset it in two places and could leave it armed if the
    # handler itself raised.
    signal.alarm(0)
# Exemplo n.º 2
# 0
def read_wikistats(lang, f):
    """
    Read wikistats lines from *f* and process redirect pages.

    Each line is expected to be ``<lang> <page> <count> ...``. Lines for
    other languages are skipped. Depending on the module-level REDIRECTS
    flag, matching valid ns0 pages are either echoed verbatim to stdout or
    accumulated into the module-level ``pagecounts`` dict.

    Args:
        lang: Language code to filter on (first whitespace field).
        f: An open iterable file object; always closed on exit.

    Malformed lines (UnicodeError / IndexError) are reported to stderr and
    skipped; an IOError on the stream aborts the loop.
    """
    try:
        for line in f:
            try:
                field = line.split()
                if lang != field[0]:
                    continue  # not our language
                page = field[1]
                if utils.is_valid_title(page) and utils.is_title_in_ns0(page):
                    if not REDIRECTS:
                        # Original Python 2 `print line,`: emit the raw line
                        # without appending a second newline.
                        sys.stdout.write(line)
                    else:
                        title = utils.normalize_title(page)
                        if title:
                            pagecounts[title] = pagecounts.get(title, 0) + int(field[2])
            except UnicodeError:
                sys.stderr.write("UnicodeError: %s" % line)
            except IndexError:
                sys.stderr.write("IndexError: %s" % line)
    except IOError:
        sys.stderr.write("IOError")
    finally:
        if f:
            f.close()
# Exemplo n.º 3
# 0
 def new(cls, name):
     """Create a Tag for *name*, persist it, and return it.

     The tag's ``normal`` field is the normalized form of *name* as
     produced by ``normalize_title``.
     """
     entity = Tag(name=name, normal=normalize_title(name))
     entity.put()
     return entity