def load_lexicon(self, filename=None, stemmed=False):
    """Load a hashtag-annotated category lexicon into ``self._scores``.

    Each data line looks like ``word#N <ignored> category* category ...``,
    optionally followed by a ``|``-separated suffix that is discarded.
    The first column is the word (any ``#<digits>`` sense tag stripped,
    lowercased); the second column is skipped; every remaining column is a
    category name (lowercased, ``*`` trimmed) mapped to score 1.0.

    :param filename: lexicon path; defaults to ``self.filename``.
    :param stemmed: if True, store the Porter stem of each word.
    :return: a human-readable summary string of what was read.
    """
    if filename is None:
        filename = self.filename
    self._scores = {}
    re_hashtag = re.compile(r'\#\d+')
    with codecs.open(filename, 'r', 'utf-8') as f:
        for line in f:
            if '|' in line:
                prefix, _ = line.split(u'|', 1)
            else:
                prefix = line.strip()
            cols = prefix.strip().split()
            if not cols:
                # Fix: a blank (or all-whitespace) line used to leave `w`
                # unbound and raise NameError further down.
                continue
            categories = {}
            for i, tok in enumerate(cols):
                if i == 0:
                    # First column is the word; drop any '#<sense>' tag.
                    w = re_hashtag.sub(u'', cols[0]).lower()
                elif i == 1:
                    continue  # second column is deliberately ignored
                else:
                    categories[tok.lower().strip('*')] = 1.0
            if stemmed:
                w = porter_stem(w)
            if w not in self._scores:
                self._scores[w] = categories
            else:
                # Duplicate word: merge its categories into the entry.
                self._scores[w].update(categories)
    return u'Read {} items from {} (<{}>).'.format(
        len(self._scores), self.__class__.__name__,
        os.path.relpath(filename, _parentdir))
def _analyze(self, word):
    # Compute and cache the morphological analysis of `word` as the tuple
    # (porter_stem, primary_lemma, candidate_lemmas) in self.cache[word].
    stem = porter_stem(word)
    # Primary lemma: analyze a one-token "sentence", take the first
    # analysis of the first token; analyses look like 'lemma<tags>||...'
    # so keep the part before '||' and before '<'.
    # NOTE(review): assumes the analyzer yields at least one analysis per
    # token — confirm against the analyzer's contract.
    lemma = list(self.analyzer.analyze(
        [[word]]))[0][0][1].split('||')[0].split('<')[0]
    # All candidate analyses from the morphological analyzer; the double
    # `.next()` (Python 2 generator method, next(it) in Python 3) unwraps
    # the per-sentence and per-token generators.
    cand_krs = self.morph_analyzer.analyze([[word]]).next().next()
    # Normalize each candidate the same way as the primary lemma.
    candidates = [cand.split('||')[0].split('<')[0] for cand in cand_krs]
    self.cache[word] = (stem, lemma, candidates)
def remove_stems(text, output_file):
    """Write one representative word per Porter stem to a dictionary file.

    Groups the (stripped) words in `text` by Porter stem, writes the first
    word of each group to ``dictionaries/<output_file>`` under the current
    working directory, and echoes any group with more than one member for
    manual inspection.

    :param text: iterable of words (one word per item).
    :param output_file: file name created inside ``dictionaries/``.
    """
    # Fix: the original joined the *builtin* `dir` function into the path
    # (os.path.join(dir, ...)), which raises TypeError at runtime; use the
    # current working directory, matching remove_stems_from_file().
    dictionary_dir = os.path.join(os.getcwd(), 'dictionaries/')
    md = defaultdict(list)
    for word in text:
        word = word.strip()
        md[porter_stem(word)].append(word)
    # 'w' (text mode) since we write str; works on Python 2 and 3.
    with open(os.path.join(dictionary_dir, output_file), 'w') as output:
        for k, v in md.items():
            output.write(v[0] + "\n")
            if len(v) > 1:
                # Surface ambiguous stem groups for manual review.
                print(v)
def remove_stems_from_file():
    """Deduplicate ``dictionaries/temp-extend-technology`` by Porter stem.

    Reads one word per line, groups words by their Porter stem, and writes
    the first word of each group to
    ``dictionaries/temp-extend-technology-unique``; groups with more than
    one member are echoed for manual inspection.
    """
    # Renamed from `dir`, which shadowed the builtin of the same name.
    cwd = os.getcwd()
    dictionary_dir = os.path.join(cwd, 'dictionaries/')
    md = defaultdict(list)
    with open(os.path.join(dictionary_dir, 'temp-extend-technology')) as f:
        for word in f:
            word = word.strip()
            md[porter_stem(word)].append(word)
    # 'w' (text mode) since we write str; works on Python 2 and 3.
    with open(os.path.join(dictionary_dir,
                           'temp-extend-technology-unique'), 'w') as output:
        for k, v in md.items():
            output.write(v[0] + "\n")
            if len(v) > 1:
                # Surface ambiguous stem groups for manual review.
                print(v)
def load_lexicon(self, filenames=None, stemmed=False):
    """Load a positive/negative word-list pair into ``self._scores``.

    Words from the first file get ``{'+ve': 1.0}`` and words from the
    second ``{'+ve': -1.0}``; a word already seen keeps its first score.

    :param filenames: pair of (positive, negative) paths; defaults to
        ``self.filenames``.
    :param stemmed: if True, store the Porter stem of each word.
    :return: a human-readable summary string of what was read.
    """
    if filenames is None:
        filenames = self.filenames
    self._scores = {}
    for score, path in zip((1.0, -1.0), filenames):
        with codecs.open(path, 'r', 'utf-8') as fh:
            for raw in fh:
                word = raw.strip()
                if stemmed:
                    word = porter_stem(word)
                # First occurrence wins; later duplicates are ignored.
                self._scores.setdefault(word, {'+ve': score})
    return u'Read {} items from {} (<{}>, <{}>).'.format(
        len(self._scores), self.__class__.__name__,
        os.path.relpath(filenames[0], _parentdir),
        os.path.relpath(filenames[1], _parentdir))
def load_lexicon(self, filename=None, stemmed=False):
    """Load 50-dimensional word embeddings from a tab-separated file.

    Each line holds a word followed by 50 tab-separated components.  The
    ``<unk>`` row is cached separately as the fallback embedding.

    Fix: components are now parsed as floats — the original stored
    ``np.array`` of the raw *strings* (unicode dtype, including the
    trailing newline on the last column), which made the vectors unusable
    for any arithmetic.

    :param filename: embedding file path; defaults to ``self.filename``.
    :param stemmed: if True, key embeddings by the Porter stem.
    :return: a human-readable summary string of what was read.
    """
    if filename is None:
        filename = self.filename
    self._embeddings = {}
    self.dimensions = 50
    with codecs.open(filename, 'r', 'utf-8') as f:
        for line in f:
            cols = line.split(u'\t')
            assert len(cols) == self.dimensions + 1
            w = porter_stem(cols[0]) if stemmed else cols[0]
            # dtype=float parses the strings (float() tolerates the
            # trailing '\n' on the last column).
            self._embeddings[w] = np.array(cols[1:], dtype=float)
    self._unk_embedding = self._embeddings['<unk>']
    return u'Read {} embeddings from {} (<{}>).'.format(
        len(self._embeddings), self.__class__.__name__,
        os.path.relpath(filename, _parentdir))
def load_lexicon(self, filename=None, stemmed=False):
    """Load a (word, category, score) lexicon into ``self._scores``.

    Lines are parsed by ``self._parse_line``; zero-scored entries are
    dropped, and each word accumulates one score per category.

    :param filename: lexicon path; defaults to ``self.filename``.
    :param stemmed: if True, key entries by the Porter stem.
    :return: a human-readable summary string of what was read.
    """
    if filename is None:
        filename = self.filename
    self._scores = {}
    with codecs.open(filename, 'r', 'utf-8') as fh:
        for raw in fh:
            word, category, score = self._parse_line(raw)
            score = float(score)
            if score == 0.0:
                continue  # neutral entry — skip
            if stemmed:
                word = porter_stem(word)
            self._scores.setdefault(word, {})[category] = score
    return u'Read {} items from {} (<{}>).'.format(
        len(self._scores), self.__class__.__name__,
        os.path.relpath(filename, _parentdir))
def load_lexicon(self, filename=None, stemmed=False):
    """Load an effect lexicon into ``self._scores``.

    Each line is ``id<TAB>effect<TAB>word,word,...<TAB>rest``.  Lines with
    effect 'Null' are skipped; otherwise the effect label is abbreviated to
    its first letter plus 'eff' (e.g. 'Positive' -> 'Peff') and every
    comma-separated word (underscores turned into spaces) gets score 1.0
    under that label.

    :param filename: lexicon path; defaults to ``self.filename``.
    :param stemmed: if True, key entries by the Porter stem.
    :return: a human-readable summary string of what was read.
    """
    if filename is None:
        filename = self.filename
    self._scores = {}
    with codecs.open(filename, 'r', 'utf-8') as fh:
        for raw in fh:
            id_, effect, words, _rest = raw.strip().split(u'\t', 3)
            if effect == 'Null':
                continue
            label = effect[0] + 'eff'
            for token in words.split(u','):
                token = token.replace(u'_', u' ')
                if stemmed:
                    token = porter_stem(token)
                self._scores.setdefault(token, {})[label] = 1.0
    return u'Read {} items from {} (<{}>).'.format(
        len(self._scores), self.__class__.__name__,
        os.path.relpath(filename, _parentdir))
def __init__(self, filename=None, stemmed=False):
    """Read a ``cluster<TAB>word<TAB>...`` file into ``self._clusters``.

    Cluster strings are mapped to dense integer ids (1-based, in order of
    first appearance); lines that do not split into at least three
    tab-separated fields are skipped silently.

    :param filename: cluster file path; defaults to ``self.filename``.
    :param stemmed: if True, key words by their Porter stem.
    """
    if filename is None:
        filename = self.filename
    cluster_ids = {}
    n_clusters = 0
    self._clusters = {}
    with codecs.open(filename, 'r', 'utf-8') as fh:
        for raw in fh:
            try:
                cluster, word, _rest = raw.split(u'\t', 2)
                if cluster not in cluster_ids:
                    n_clusters += 1
                    cluster_ids[cluster] = n_clusters
                if stemmed:
                    word = porter_stem(word)
                self._clusters[word] = cluster_ids[cluster]
            except ValueError:
                pass  # malformed line — ignore, matching original intent
    logger.debug('Read {} words and {} clusters for {} (<{}>).'.format(
        len(self._clusters), n_clusters, self.__class__.__name__,
        os.path.relpath(filename, _parentdir)))
def load_lexicon(self, filename=None, stemmed=False):
    """Load the MPQA subjectivity lexicon into ``self._scores``.

    Lines are matched by ``self.re_line`` (group 1 = strength, group 3 =
    word, group 6 = prior polarity).  Neutral priors are dropped.
    'strongsubj' entries score 1.0 and 'weaksubj' 0.5; 'weakpos' /
    'weakneg' priors halve the score again.

    :param filename: lexicon path; defaults to ``self.filename``.
    :param stemmed: if True, key entries by the Porter stem.
    :return: a human-readable summary string of what was read.
    :raises Exception: if a line does not match ``self.re_line``.
    """
    if filename is None:
        filename = self.filename
    self._scores = {}
    # prior polarity -> tuple of (score key, multiplier) pairs
    polarity_table = {
        'both': (('+ve', 1.0), ('-ve', 1.0)),
        'positive': (('+ve', 1.0),),
        'negative': (('-ve', 1.0),),
        'weakpos': (('+ve', 0.5),),
        'weakneg': (('-ve', 0.5),),
    }
    with codecs.open(filename, 'r', 'utf-8') as fh:
        for raw in fh:
            m = self.re_line.match(raw)
            if m is None:
                raise Exception(
                    'Unable to parse line in MPQASubjectivityLexicon: {}'.
                    format(raw.strip()))
            prior = m.group(6)
            if prior == 'neutral':
                continue
            word = m.group(3)
            if stemmed:
                word = porter_stem(word)
            base = 1.0 if m.group(1) == 'strongsubj' else 0.5
            # An entry is created even for unknown priors, matching the
            # original behavior.
            entry = self._scores.setdefault(word, {})
            for key, mult in polarity_table.get(prior, ()):
                entry[key] = base * mult
    return u'Read {} items from {} (<{}>).'.format(
        len(self._scores), self.__class__.__name__,
        os.path.relpath(filename, _parentdir))
def load_lexicon(self, filename=None, stemmed=False):
    """Load SentiWordNet-style 'pos'/'neg' scores into ``self._scores``.

    Each data line is ``pos<TAB>id<TAB>pos_score<TAB>neg_score<TAB>synset
    <TAB>gloss``.  Comment lines (starting with '#') and entries with an
    empty id are skipped; every word in the synset (its ``#<sense>``
    suffix removed) gets the line's scores, later lines overwriting
    earlier ones.

    :param filename: lexicon path; defaults to ``self.filename``.
    :param stemmed: if True, key entries by the Porter stem.
    :return: a human-readable summary string of what was read.
    """
    if filename is None:
        filename = self.filename
    self._scores = {}
    re_sense = re.compile(r'\#\d+$')
    with codecs.open(filename, 'r', 'utf-8') as fh:
        for raw in fh:
            if raw.startswith('#'):
                continue
            pos_tag, synset_id, pos_score, neg_score, synset, _rest = \
                raw.split(u'\t', 5)
            if not synset_id:
                continue
            pos_val = float(pos_score)
            neg_val = float(neg_score)
            for token in synset.split():
                token = re_sense.sub('', token)
                if stemmed:
                    token = porter_stem(token)
                # Fresh dict per word, as in the original (no aliasing).
                self._scores[token] = {'pos': pos_val, 'neg': neg_val}
    return u'Read {} items from {} (<{}>).'.format(
        len(self._scores), self.__class__.__name__,
        os.path.relpath(filename, _parentdir))