def test_blank(self):
    """does not produce results from blank lines"""
    cp = CorpusParser()
    results = list(cp.parse(self.blank))
    self.assertEqual(0, len(results))
from collections import defaultdict

import numpy as np
import pandas as pd

from corpus_parser import CorpusParser


def splitter(sources):
    res_train, res_test = defaultdict(dict), defaultdict(dict)
    for source in sources:
        # Parse corpus
        parser = CorpusParser(input_dir=source, ldir="./log/")
        parser.parse()

        # Get and transform to DataFrame
        reviews = parser.get_parsed()
        reviews = pd.DataFrame(reviews)

        # Split train/test (~80/20)
        np.random.seed(42)
        msk = np.random.rand(len(reviews)) < 0.8
        train, test = reviews[msk], reviews[~msk]

        # Transform back to dicts
        train = train.T.to_dict().values()
        test = test.T.to_dict().values()

        # Add to result
        res_train[source] = train
        res_test[source] = test
    return res_train, res_test
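# A minimal usage sketch for splitter(); the corpus directory paths below
# are hypothetical placeholders, not paths from this project.
if __name__ == '__main__':
    sources = ['./corpora/pos', './corpora/neg']  # assumed example dirs
    train_sets, test_sets = splitter(sources)
    for source in sources:
        # Roughly 80% of each source's reviews land in train, 20% in test.
        print(source, len(train_sets[source]), len(test_sets[source]))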
def test_parse(self):
    """will parse a brown corpus line using the standard / notation"""
    cp = CorpusParser()
    null = CorpusParser.TagWord('START', 'START')
    several = CorpusParser.TagWord('Several', 'ap')
    defendants = CorpusParser.TagWord('defendants', 'nns')
    period = CorpusParser.TagWord('.', '.')
    expectations = [
        [null, several],
        [several, defendants],
        [defendants, period],
    ]
    results = list(cp.parse(self.stream))
    self.assertListEqual(expectations, results)
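# For reference, a Brown-corpus line in the standard / notation pairs each
# token with its tag. The sample line below is inferred from the TagWords in
# the test above, not taken from the real self.stream fixture.
def demo_parse():  # hypothetical helper, not part of the test suite
    cp = CorpusParser()
    line = 'Several/ap defendants/nns ./.'
    for first, second in cp.parse(line):
        print(first.word, first.tag, '->', second.word, second.tag)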
import sys
sys.path.append('./src/')

from corpus_parser import CorpusParser
from corpus_attributes import CorpusAttributes

# Pick the corpus to parse
corpus_parser = CorpusParser(sys.argv)

# Parse it
corpus_parser.parse()

# Create the attributes object
corpus_attributes = CorpusAttributes(corpus_parser)

# Get aggregate statistics on attributes over time
corpus_attributes.get_attributes()
corpus_attributes.get_stats(sys.argv[3])
corpus_attributes.print_stats()
import sys
sys.path.append('./src/')

from corpus_parser import CorpusParser
from naive_sentiment import NaiveSentiment

# Pick the corpus to parse
corpus_parser = CorpusParser(sys.argv)

# Parse it
corpus_parser.parse()

# Get stats for the particular corpus
naive_sentiment = NaiveSentiment(corpus_parser)

# Parse articles for content words and print their frequencies
naive_sentiment.get_stats()
naive_sentiment.print_stats()
import re
from collections import defaultdict

from corpus_parser import CorpusParser


class POSTagger(object):
    """
    This class is responsible for tagging new data
    given the corpus training data.
    """

    class LazyFile(object):
        """
        Wraps a file object in an iterator, which opens
        the file only when iterated.
        """

        def __init__(self, filename):
            self.filename = filename
            self.file = None

        def __iter__(self):
            self.file = open(self.filename, 'r')
            return self

        def __next__(self):
            try:
                line = next(self.file)
            except StopIteration as e:
                self.file.close()
                raise e
            return line

        def next(self):
            return self.__next__()

    @classmethod
    def from_filepaths(cls, training_files, eager=False):
        """
        Create a POSTagger from a list of file names.

        :param training_files: list of file names
        :param eager: boolean: train while opening
        :return: POSTagger
        """
        lazy_files = [POSTagger.LazyFile(fn) for fn in training_files]
        return POSTagger(lazy_files, eager)

    def __init__(self, data_io=(), eager=False):
        self.corpus_parser = CorpusParser()
        self.data_io = data_io
        self.trained = False
        if eager:
            self.train()
            self.trained = True

    def train(self):
        if not self.trained:
            self.tags = set()
            self.tag_combos = defaultdict(int)
            self.tag_frequencies = defaultdict(int)
            self.word_tag_combos = defaultdict(int)
            for io in self.data_io:
                for line in io:
                    for ngram in self.corpus_parser.parse(line):
                        self.write(ngram)
            self.trained = True

    def write(self, ngram):
        """Record tag, word/tag, and tag/tag counts for one bigram.

        :param ngram: pair of TagWords
        """
        if ngram[0].tag == 'START':
            self.tag_frequencies['START'] += 1
            self.word_tag_combos['START/START'] += 1
        self.tags.add(ngram[-1].tag)
        self.tag_frequencies[ngram[-1].tag] += 1
        combo = ngram[-1].word + '/' + ngram[-1].tag
        self.word_tag_combos[combo] += 1
        combo = ngram[0].tag + '/' + ngram[-1].tag
        self.tag_combos[combo] += 1

    def viterbi(self, sentence):
        # Separate sentence-final punctuation, then split on whitespace.
        sentence1 = re.sub(r'([\.\?!])', r' \1', sentence)
        parts = re.split(r'\s+', sentence1)

        last_viterbi = {}
        backpointers = ['START']
        for tag in self.tags:
            if tag == 'START':
                continue
            probability = self.tag_probability('START', tag) \
                * self.word_tag_probability(parts[0], tag)
            if probability > 0:
                last_viterbi[tag] = probability

        # Back off to the globally most frequent tag when nothing scored.
        if len(last_viterbi) > 0:
            backpointer = max(last_viterbi,
                              key=(lambda key: last_viterbi[key]))
        else:
            backpointer = max(self.tag_frequencies,
                              key=(lambda key: self.tag_frequencies[key]))
        backpointers.append(backpointer)

        for part in parts[1:]:
            viterbi = {}
            for tag in self.tags:
                if tag == 'START':
                    continue
                if len(last_viterbi) == 0:
                    break
                best_tag = max(last_viterbi,
                               key=(lambda prev_tag:
                                    last_viterbi[prev_tag] *
                                    self.tag_probability(prev_tag, tag) *
                                    self.word_tag_probability(part, tag)))
                probability = last_viterbi[best_tag] * \
                    self.tag_probability(best_tag, tag) * \
                    self.word_tag_probability(part, tag)
                if probability > 0:
                    viterbi[tag] = probability
            last_viterbi = viterbi

            if len(last_viterbi) > 0:
                backpointer = max(last_viterbi,
                                  key=(lambda key: last_viterbi[key]))
            else:
                backpointer = max(self.tag_frequencies,
                                  key=(lambda key: self.tag_frequencies[key]))
            backpointers.append(backpointer)
        return backpointers

    def tag_probability(self, previous_tag, current_tag):
        """Maximum likelihood estimate:
        count(previous_tag, current_tag) / count(previous_tag)"""
        denom = self.tag_frequencies[previous_tag]
        if denom == 0:
            return 0
        return self.tag_combos[previous_tag + '/' + current_tag] / float(denom)

    def word_tag_probability(self, word, tag):
        """Maximum likelihood estimate:
        count(word and tag) / count(tag)"""
        denom = self.tag_frequencies[tag]
        if denom == 0:
            return 0
        return self.word_tag_combos[word + '/' + tag] / float(denom)

    def probability_of_word_tag(self, words, tags):
        if len(words) != len(tags):
            raise ValueError('The word and tags must be the same length!')
        length = len(words)
        probability = 1.0
        for i in range(1, length):
            probability *= self.tag_probability(tags[i - 1], tags[i]) * \
                self.word_tag_probability(words[i], tags[i])
        return probability
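# A minimal usage sketch for POSTagger, assuming Brown-style tagged files on
# disk; the file paths below are placeholders.
if __name__ == '__main__':
    tagger = POSTagger.from_filepaths(['./data/brown/ca01',
                                       './data/brown/ca02'], eager=True)
    # viterbi() returns the backpointer tag sequence, starting with 'START'.
    print(tagger.viterbi('Several defendants objected .'))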
import re
from collections import defaultdict

from corpus_parser import CorpusParser


class POSTagger:
    def __init__(self, data_io):
        self.corpus_parser = CorpusParser()
        self.data_io = data_io
        self.trained = False

    def train(self):
        if not self.trained:
            self.tags = set(['START'])
            self.tag_combos = defaultdict(int)
            self.tag_frequencies = defaultdict(int)
            self.word_tag_combos = defaultdict(int)
            for io in self.data_io:
                for line in io.readlines():
                    for ngram in self.corpus_parser.parse(line):
                        self.write(ngram)
            self.trained = True

    def write(self, ngram):
        if ngram[0].tag == 'START':
            self.tag_frequencies['START'] += 1
            self.word_tag_combos['START/START'] += 1
        self.tags.add(ngram[-1].tag)
        self.tag_frequencies[ngram[-1].tag] += 1
        self.word_tag_combos['/'.join([ngram[-1].word, ngram[-1].tag])] += 1
        self.tag_combos['/'.join([ngram[0].tag, ngram[-1].tag])] += 1

    def tag_probability(self, previous_tag, current_tag):
        denom = self.tag_frequencies[previous_tag]
        if denom == 0:
            return 0.0
        return self.tag_combos['/'.join([previous_tag, current_tag])] / float(denom)

    def word_tag_probability(self, word, tag):
        denom = self.tag_frequencies[tag]
        if denom == 0:
            return 0.0
        return self.word_tag_combos['/'.join([word, tag])] / float(denom)

    def probability_of_word_tag(self, word_sequence, tag_sequence):
        if len(word_sequence) != len(tag_sequence):
            raise ValueError('The word and tags must be the same length!')
        length = len(word_sequence)
        probability = 1.0
        for i in range(1, length):
            probability *= (
                self.tag_probability(tag_sequence[i - 1], tag_sequence[i]) *
                self.word_tag_probability(word_sequence[i], tag_sequence[i])
            )
        return probability

    def viterbi(self, sentence):
        # Separate sentence-final punctuation, then split on whitespace.
        sentence = re.sub(r'([\.\?!])', r' \1', sentence)
        parts = re.split(r'\s+', sentence)

        last_viterbi = {}
        backpointers = ['START']
        for tag in self.tags:
            if tag == 'START':
                continue
            probability = self.tag_probability('START', tag) * \
                self.word_tag_probability(parts[0], tag)
            if probability > 0:
                last_viterbi[tag] = probability

        # Back off to the globally most frequent tag when nothing scored.
        if last_viterbi:
            backpointers.append(max(last_viterbi, key=last_viterbi.get))
        else:
            backpointers.append(max(self.tag_frequencies,
                                    key=self.tag_frequencies.get))

        for part in parts[1:]:
            viterbi = {}
            for tag in self.tags:
                if tag == 'START':
                    continue
                if not last_viterbi:
                    break
                best_tag = max(
                    last_viterbi,
                    key=lambda prev_tag: last_viterbi[prev_tag] *
                    self.tag_probability(prev_tag, tag) *
                    self.word_tag_probability(part, tag)
                )
                probability = last_viterbi[best_tag] * \
                    self.tag_probability(best_tag, tag) * \
                    self.word_tag_probability(part, tag)
                if probability > 0:
                    viterbi[tag] = probability
            last_viterbi = viterbi

            if last_viterbi:
                backpointers.append(max(last_viterbi, key=last_viterbi.get))
            else:
                backpointers.append(max(self.tag_frequencies,
                                        key=self.tag_frequencies.get))
        return backpointers
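# Both viterbi() implementations above follow the same recurrence: the best
# score for tag t at word i is the max over previous tags t' of
# score(i-1, t') * P(t | t') * P(word_i | t). A sketch of that single step in
# isolation; viterbi_step is a hypothetical helper, assuming a trained tagger.
def viterbi_step(tagger, last_viterbi, word):
    """One step of the Viterbi recurrence over a trained POSTagger.

    last_viterbi maps each tag to the best probability of any tag
    sequence ending in that tag at the previous position.
    """
    current = {}
    for tag in tagger.tags:
        if tag == 'START' or not last_viterbi:
            continue
        best = max(last_viterbi[prev] * tagger.tag_probability(prev, tag)
                   for prev in last_viterbi)
        probability = best * tagger.word_tag_probability(word, tag)
        if probability > 0:
            current[tag] = probability
    return current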