def wikicorpus_es(n=200, m=3, map=lambda tag: tag, offset=0):
    """ Returns a (sentences, lemmata, entities)-tuple from the Spanish Wikicorpus:
        http://www.lsi.upc.edu/%7Enlp/wikicorpus/
        Each sentence in the list of n sentences is a list of (word, tag)-tuples.
        Each sentence has at least m words.
        Tags are mapped using the given map function.
        Lemmata is a dictionary of (word, lemma)-items.
        Entities is a dictionary of (word, frequency)-items.
    """
    # <doc id="267233" title="Deltoide" dbindex="85004">
    # ; ; Fx 0
    # En en NP00000 0
    # geometría geometría NCFS000 0
    # , , Fc 0
    # un uno DI0MS0 0
    # deltoide deltoide NCFS000 0
    # es ser VSIP3S0 01775973
    # un uno DI0MS0 0
    # cuadrilátero cuadrilátero NCMS000 0
    # no no RN 0
    # regular regular AQ0CS0 01891762
    # ...
    # </doc>
    sentences, lemmata, entities = [], {}, {}
    sentence = []
    i = 0
    for f in glob("tagged.es/*")[offset:]:
        for s in u(open(f).read()).encode("utf-8").split("\n"):
            if s and not s.startswith(("<doc", "</doc")):
                word, lemma, tag, x = s.split(" ")
                word, lemma = word.replace("_", " "), lemma.replace("_", " ")
                if word in ("ENDOFARTICLE", "REDIRECT", "Acontecimientos", "Fallecimientos", "Nacimientos"):
                    continue
                if " " not in word:
                    lemmata[word] = lemma
                # Wikicorpus sometimes bundles consecutive words,
                # but the Brill tagger only handles unigrams,
                # so we split into individual words.
                for word in word.split(" "):
                    sentence.append((word, map(tag)))
                    if tag.startswith("NP"):
                        entities[word] = entities.get(word, 0) + 1
                if tag == "Fp" and word == "." and len(sentence) >= m:
                    sentences.append(sentence)
                    sentence = []
                    i += 1
                    #if i % 100 == 0:
                    #    print i
                    if i >= n:
                        return sentences, lemmata, entities
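# A minimal usage sketch (not part of the original script), assuming the unpacked
# Wikicorpus files are available in a local "tagged.es/" folder:
#
#   sentences, lemmata, entities = wikicorpus_es(n=10, m=3)
#   print len(sentences)                 # <= 10 tagged sentences
#   print sentences[0][:5]               # first five (word, tag)-tuples
#   print sorted(entities.items(), key=lambda kv: -kv[1])[:5]  # most frequent named entities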
# Each tweet is assigned an id based on the query, tweet message and date
# (a sketch of one possible id scheme follows the candidate list below).
# Tweets whose id is already in the CSV files don't need to pass through Google Translate again.
SEEN = {}
for filename in glob.glob("harvest*.txt"):
    f = open(filename)
    s = f.read(); f.close()
    if s == "":
        continue
    s = s.strip().split("\n")
    s = [x.split("\t") for x in s]
    for x in s:
        if len(x) != 10:
            # This row wasn't saved correctly, probably due to a stray \n or \t.
            print "check %s %s" % (filename, x)
        else:
            SEEN[web.u(x[0])] = True

# --------------------------------------------------------------------------------------------------
# After a few days we stopped mining for all candidates
# and simply scanned for tweets on the top 100 most frequently mentioned.
#top100 = [
#    u'bart de wever', u'didier reynders', u'marianne thyssen', u'alexandra colen', u'alexander de croo',
#    u'charles michel', u'yves leterme', u'jo\xeblle milquet', u'caroline gennez', u'elio di rupo',
#    u'pieter de crem', u'olivier maingain', u'michel daerden', u'freya piryns', u'filip dewinter',
#    u'louis michel', u'rik torfs', u'eva brems', u'johan vande lanotte', u'fran\xe7ois bellot',
#    u'frank vandenbroucke', u'jean-marie dedecker', u'steven vanackere', u'geert bourgeois',
#    u'anne de baetzelier', u'guy vanhengel', u'geert lambert', u'philippe moureaux', u'stefaan de clerck',
#    u'paul magnette', u'pascal smet', u'bert anciaux', u'annemie turtelboom', u'siegfried bracke',
#    u'bruno tobback', u'rudy demotte', u'alain destexhe', u'rudy aernoudt', u'wouter de vriendt',
#    u'lode vereeck', u'laurette onkelinx', u'alain mathot', u'bruno valkeniers', u'carl devlies',
#    u'sabine laruelle', u'rik daems', u'hans bonte', u'dirk van der maelen', u'tom maes', u'melchior wathelet',
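# A minimal sketch of one possible id scheme (the harvester's actual id function is
# not shown in this excerpt, so the exact recipe is an assumption): hashing the
# query, message and date gives a stable key to look up in SEEN above.
import hashlib

def tweet_id(query, message, date):
    # Hypothetical helper: concatenate the three fields and hash them.
    s = u"|".join((query, message, date)).encode("utf-8")
    return hashlib.md5(s).hexdigest()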
    for tag in suffix[x]:
        suffix[x][tag] /= f
    # Sort by word frequency and top tag frequency.
    suffix[x] = [(v, k) for k, v in suffix[x].items()]
    suffix[x] = sorted(suffix[x], reverse=True)
    suffix[x] = (f, suffix[x][0])

# Keep high tag frequency (v[1][0] >= 80%) for each suffix.
# Remove infrequent words (v[0] < 10).
# You can play around with the 80%/10 threshold to see if accuracy increases.
suffix = [(v, k) for k, v in suffix.items() if 1.0 >= v[1][0] >= 0.80 and v[0] >= 10]
suffix = sorted(suffix, reverse=True)

lexical = []
for (count, (frequency, tag)), x in suffix:
    x = u(x).encode("utf-8")
    # One Brill lexical rule per line:
    # retag default-tagged words ending in suffix x with the given tag.
    r = "%s %s fhassuf %s %s x" % (DEFAULT, x, len(u(x)), tag)
    lexical.append(r)
    if len(lexical) == 100: # Number of lexical rules.
        break

open("brill-lexical.txt", "w").write("\n".join(lexical))

# 9) Convert the NLTK base tagger to Brill's text file format.
# Exclude the anonymized Named_Entity; we'll add the top named entities in the next step.
print "Generating lexicon..."
lexicon, seen = {}, {}
for x in base._context_to_tag.items():
    if isinstance(base, UnigramTagger):
        word, tag = x
    if isinstance(base, BigramTagger):
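# A minimal sketch (not part of the original script) that reads the generated
# brill-lexical.txt back in and applies the "fhassuf" rules to a single word,
# following the format string used above: "<DEFAULT> <suffix> fhassuf <len> <tag> x".
def apply_lexical_rules(word, tag, path="brill-lexical.txt"):
    for r in open(path).read().strip().split("\n"):
        r = r.split(" ")
        if len(r) == 6 and r[2] == "fhassuf":
            # Retag words that carry the default tag and end in the given suffix.
            if tag == r[0] and word.endswith(r[1]):
                tag = r[4]
    return tag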