Example #1
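# Note: this excerpt assumes "from glob import glob" and a u() helper that decodes the
# raw Wikicorpus files to unicode; both come from the surrounding script and are not
# shown here.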
def wikicorpus_es(n=200, m=3, map=lambda tag: tag, offset=0):
    """ Returns a (sentences, lemmata, entities)-tuple from the Spanish Wikicorpus:
        http://www.lsi.upc.edu/%7Enlp/wikicorpus/
        Each sentence in the list of n sentences is a list of (word, tag)-tuples.
        Each sentence has at least m words.
        Tags are mapped using the given map function.
        The lemmata is a dictionary of (word, lemma)-items.
        The entities is a dictionary of (word, frequency)-items.
    """
    # <doc id="267233" title="Deltoide" dbindex="85004">
    # ; ; Fx 0
    # En en NP00000 0
    # geometría geometría NCFS000 0
    # , , Fc 0
    # un uno DI0MS0 0
    # deltoide deltoide NCFS000 0
    # es ser VSIP3S0 01775973
    # un uno DI0MS0 0
    # cuadrilátero cuadrilátero NCMS000 0
    # no no RN 0
    # regular regular AQ0CS0 01891762
    # ...
    # </doc>
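    # Each data line has four space-separated fields: word, lemma, Parole tag and
    # (apparently) a word sense id, where 0 means no sense was assigned.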
    sentences, lemmata, entities = [], {}, {}
    sentence = []
    i = 0
    for f in glob("tagged.es/*")[offset:]:
        for s in u(open(f).read()).encode("utf-8").split("\n"):
            if s and not s.startswith(("<doc", "</doc")):
                word, lemma, tag, x = s.split(" ")
                word, lemma = word.replace("_", " "), lemma.replace("_", " ")
                if word in ("ENDOFARTICLE", "REDIRECT", "Acontecimientos",
                            "Fallecimientos", "Nacimientos"):
                    continue
                if " " not in word:
                    lemmata[word] = lemma
                # Wikicorpus sometimes bundles consecutive words,
                # but the Brill tagger only handles unigrams,
                # so we split into individual words.
                for word in word.split(" "):
                    sentence.append((word, map(tag)))
                    if tag.startswith("NP"):
                        entities[word] = entities.get(word, 0) + 1
                    if tag == "Fp" and word == "." and len(sentence) >= m:
                        sentences.append(sentence)
                        sentence = []
                        i += 1
                        #if i % 100 == 0:
                        #    print i
                    if i >= n:
                        return sentences, lemmata, entities
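
# A minimal usage sketch (not part of the original script): the map parameter here
# collapses the detailed Parole tags to their first two characters (e.g. NCFS000 -> NC,
# VSIP3S0 -> VS), which is one possible way to train on a coarser tagset. It assumes
# the tagged.es/ files are present next to this script.
sentences, lemmata, entities = wikicorpus_es(n=1000, map=lambda tag: tag[:2])
print "%s sentences, %s lemmata, %s entities" % (len(sentences), len(lemmata), len(entities))
print sentences[0]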
Example #2
# Each tweet is assigned an id based on the query, the tweet message and the date.
# Tweets whose id already appears in the CSV files don't need to be passed through Google Translate again.
SEEN = {}
for filename in glob.glob("harvest*.txt"):
    f = open(filename)
    s = f.read(); f.close()
    if s == "":
        continue
    s = s.strip().split("\n")
    s = [x.split("\t") for x in s]
    for x in s:
        if len(x) != 10:
            # This row wasn't saved correctly, probably something to do with \n or \t.
            print "check %s %s" % (filename, x)
        else:
            SEEN[web.u(x[0])] = True
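
# A minimal sketch (not from the original script) of how SEEN can be used to skip
# tweets that were already harvested. The exact id scheme is not shown in this excerpt;
# the MD5 hash below is an assumption based on the comment that the id is derived from
# the query, the tweet message and the date. The new_tweets loop is a hypothetical
# placeholder.
import hashlib

def tweet_id(query, message, date):
    # Assumes query, message and date are unicode strings.
    return hashlib.md5((query + message + date).encode("utf-8")).hexdigest()

#for query, message, date in new_tweets:
#    id = tweet_id(query, message, date)
#    if id not in SEEN:
#        SEEN[id] = True
#        # ... pass the tweet through Google Translate and append it to the CSV.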

# --------------------------------------------------------------------------------------------------
# After a few days we stopped mining for all candidates
# and simply scanned for tweets about the 100 most frequently mentioned ones.
#top100 = [
#    u'bart de wever', u'didier reynders', u'marianne thyssen', u'alexandra colen', u'alexander de croo', 
#    u'charles michel', u'yves leterme', u'jo\xeblle milquet', u'caroline gennez', u'elio di rupo', 
#    u'pieter de crem', u'olivier maingain', u'michel daerden', u'freya piryns', u'filip dewinter', 
#    u'louis michel', u'rik torfs', u'eva brems', u'johan vande lanotte', u'fran\xe7ois bellot', 
#    u'frank vandenbroucke', u'jean-marie dedecker', u'steven vanackere', u'geert bourgeois', 
#    u'anne de baetzelier', u'guy vanhengel', u'geert lambert', u'philippe moureaux', u'stefaan de clerck', 
#    u'paul magnette', u'pascal smet', u'bert anciaux', u'annemie turtelboom', u'siegfried bracke', 
#    u'bruno tobback', u'rudy demotte', u'alain destexhe', u'rudy aernoudt', u'wouter de vriendt', 
#    u'lode vereeck', u'laurette onkelinx', u'alain mathot', u'bruno valkeniers', u'carl devlies', 
#    u'sabine laruelle', u'rik daems', u'hans bonte', u'dirk van der maelen', u'tom maes', u'melchior wathelet', 
Example #3
    for tag in suffix[x]:
        suffix[x][tag] /= float(f) # Relative tag frequency; float() guards against integer division in Python 2.
    # Sort by word frequency and top tag frequency.
    suffix[x] = [(v, k) for k, v in suffix[x].items()]
    suffix[x] = sorted(suffix[x], reverse=True)
    suffix[x] = (f, suffix[x][0])
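
# At this point each entry in suffix maps a word suffix to a
# (frequency, (top tag frequency, top tag)) tuple, for example (hypothetical numbers):
# suffix["ciones"] == (250, (0.94, "NCFP000")).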

# Keep suffixes with a high top tag frequency (v[1][0] >= 80%).
# Remove infrequent suffixes (v[0] < 10).
# You can play around with the 80%/10 thresholds to see if accuracy increases.
suffix = [(v, k) for k, v in suffix.items() if 1.0 >= v[1][0] >= 0.80 and v[0] >= 10]
suffix = sorted(suffix, reverse=True)

lexical = []
for (count, (frequency, tag)), x in suffix:
    x = u(x).encode("utf-8")
    r = "%s %s fhassuf %s %s x" % (DEFAULT, x, len(u(x)), tag)
    lexical.append(r)
    if len(lexical) == 100: # Number of lexical rules.
        break

open("brill-lexical.txt", "w").write("\n".join(lexical))

# 9) Convert the NLTK base tagger to Brill's text file format.
#    Exclude the anonymized Named_Entity; we'll add the top named entities in the next step.
print "Generating lexicon..."
lexicon, seen = {}, {}
for x in base._context_to_tag.items():
    if isinstance(base, UnigramTagger):
        word, tag = x
    if isinstance(base, BigramTagger):