# NOTE: `util`, `constants`, and `lines` are assumed to be project-local
# helpers (path constants and a file-line iterator); they are used below but
# not defined in this module.
import collections

import numpy as np


def __init__(self, path, words, dim=300, normalize=True, **kwargs):
    # Method of an embedding class (class definition not shown): loads the
    # vectors for `words` from a GloVe/word2vec-style text file at `path`,
    # where each row is "word v1 v2 ... vN".
    seen = []
    vs = {}
    for line in lines(path):
        split = line.split()
        w = split[0]
        if w in words:
            seen.append(w)
            # list(...) is needed on Python 3, where map() returns an iterator
            vs[w] = np.array(list(map(float, split[1:])), dtype='float32')
    self.iw = seen  # index -> word
    self.wi = {w: i for i, w in enumerate(self.iw)}  # word -> index
    # np.vstack needs a sequence, not a generator, so materialize the list
    self.m = np.vstack([vs[w] for w in self.iw])
    if normalize:
        self.normalize()
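
# Usage sketch (illustration only: the enclosing class name -- call it
# Embedding -- and the vector-file path are assumptions, not part of this
# module):
#
#   vocab = set(get_vocab("vectors/glove.txt", limit=50000))
#   emb = Embedding("vectors/glove.txt", vocab, dim=300)
#   emb.m.shape             # (len(emb.iw), 300): one row per word found
#   emb.m[emb.wi["good"]]   # the vector for "good"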
def make_bingliu_lexicon():
    # Bing Liu opinion lexicon: two plain-text word lists, one per polarity.
    polarities = {}
    for polarity in ['positive', 'negative']:
        for line in util.lines(
                constants.LEXICONS + 'bl_opinion_lexicon/{:}-words.txt'.format(polarity)):
            try:
                # Drop non-ASCII bytes, then decode back to str so the ';'
                # comment check below works on Python 3
                line = line.strip().encode('ascii', 'ignore').decode('ascii')
                if len(line) == 0 or line[0] == ';':
                    continue
                polarities[line] = 1 if polarity == 'positive' else -1
            except Exception:
                print("skipping", line)
    util.write_json(polarities, constants.PROCESSED_LEXICONS + 'bingliu.json')
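
# Each make_*_lexicon function in this module emits the same shape of file: a
# JSON object mapping word -> polarity label (or real-valued score). A minimal
# read-back sketch, assuming util.write_json writes plain JSON:
#
#   import json
#   with open(constants.PROCESSED_LEXICONS + 'bingliu.json') as f:
#       bingliu = json.load(f)
#   negative = [w for w, p in bingliu.items() if p == -1]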
def make_twitter_lexicon():
    polarities = {}
    for line in util.lines(
            constants.LEXICONS +
            "twitter/MaxDiff-Twitter-Lexicon/Maxdiff-Twitter-Lexicon_-1to1.txt"):
        # Tab-separated "score<TAB>term"; splitting on all whitespace would
        # break multi-word terms apart and the check below would never fire
        info = line.split("\t")
        if len(info[1].split()) > 1:  # skip multi-word phrases
            continue
        if float(info[0]) < 0:
            polarities[info[1]] = -1
        else:
            polarities[info[1]] = 1
    util.write_json(polarities, constants.PROCESSED_LEXICONS + 'twitter.json')
def make_qwn_lexicon():
    # Q-WordNet: each synset entry casts a +1/-1 vote for its member words;
    # keep only words with a nonzero net vote, collapsed to the vote's sign.
    polarities = collections.defaultdict(float)
    for line in util.lines(
            constants.LEXICONS + "qwn/turneyLittman_propSyn_08_mcr30-noAntGloss.dict"):
        info = line.split("\t")
        if info[1] == "neg":
            mod = -1
        else:
            mod = 1
        for word in info[2].split(", "):
            if "_" not in word:  # skip multi-word entries
                polarities[word.split("#")[0]] += mod
    # .items() replaces the Python 2-only .iteritems()
    polarities = {
        word: np.sign(val)
        for word, val in polarities.items()
        if val != 0
    }
    util.write_json(polarities, constants.PROCESSED_LEXICONS + 'qwn.json')
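
def _demo_net_vote():
    # Illustration only (not part of the original module): the same net-vote
    # collapse used by make_qwn_lexicon, run on hand-made entries.
    votes = collections.defaultdict(float)
    for word, mod in [("cold", -1), ("cold", -1), ("cold", 1),
                      ("warm", 1), ("warm", -1)]:
        votes[word] += mod
    collapsed = {w: np.sign(v) for w, v in votes.items() if v != 0}
    assert collapsed == {"cold": -1}  # "warm" nets to zero and is dropped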
def make_inquirer_lexicon():
    polarities = {}
    for line in util.lines(constants.LEXICONS + 'inquirerbasic.csv'):
        # The CSV uses carriage returns as record separators within a line
        for l in line.strip().split('\r'):
            split = l.split(",")
            w = split[0].lower()
            if "#" in w:
                # Keep only the first sense of multi-sense entries
                if w.split("#")[1] != "1":
                    continue
                w = w.split("#")[0]
            polarity_neg = split[-1]
            polarity_pos = split[-2]
            if polarity_neg == 'Negativ' and polarity_pos == 'Positiv':
                continue  # tagged both ways; skip as contradictory
            elif polarity_neg == 'Negativ':
                polarities[w] = -1
            elif polarity_pos == 'Positiv':
                polarities[w] = 1
            else:
                polarities[w] = 0  # listed but neutral
    util.write_json(polarities, constants.PROCESSED_LEXICONS + 'inquirer.json')
def make_140_scores_lexicon():
    polarities = {}
    for line in util.lines(
            constants.LEXICONS + "Sentiment140-Lexicon-v0.1/unigrams-pmilexicon.txt"):
        info = line.split()
        polarities[info[0]] = float(info[1])  # real-valued PMI score, not a label
    util.write_json(polarities, constants.PROCESSED_LEXICONS + '140-scores.json')
def get_vocab(fname, limit=None):
    """Load vocabulary file and return as list with a given length limit."""
    vocab = [line.split()[0] for line in lines(fname)]
    return vocab[:limit]
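
# Hypothetical example: cap the vocabulary at the 10,000 most frequent words,
# assuming the file lists one word (optionally followed by a count) per line,
# most frequent first. limit=None returns the whole list, since vocab[:None]
# is a full slice.
#
#   top_words = get_vocab("data/vocab.txt", limit=10000)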