예제 #1
0
 def __init__(self, path, words, dim=300, normalize=True, **kwargs):
     """Load the vectors of the requested words from a text embedding file.

     Every line is split on whitespace: the first token is the word and the
     remaining tokens are parsed as floats forming its vector.

     Args:
         path: Passed to ``lines`` to iterate over the file line by line.
         words: Container of words to keep; only ``w in words`` is used.
         dim: Not used by this constructor; kept for interface compatibility.
         normalize: When true, ``self.normalize()`` is called after loading.
         **kwargs: Accepted and ignored.

     Attributes set:
         iw: Kept words, in the order they appear in the file.
         wi: Mapping from word to its index in ``iw``.
         m: float32 matrix stacking one vector per entry of ``iw``.

     Raises:
         ValueError: From ``np.vstack`` when no requested word was found.
     """
     seen = []
     vs = {}
     for line in lines(path):
         split = line.split()
         if not split:
             # Blank line: nothing to index, and split[0] would raise.
             continue
         w = split[0]
         if w in words:
             seen.append(w)
             # Build a concrete list: on Python 3 ``map`` is a lazy iterator
             # that np.array cannot convert into a float32 vector.
             vs[w] = np.array([float(x) for x in split[1:]], dtype='float32')
     self.iw = seen
     self.wi = {w: i for i, w in enumerate(self.iw)}
     # np.vstack needs a real sequence; a bare generator is rejected by
     # current NumPy releases.
     self.m = np.vstack([vs[w] for w in self.iw])
     if normalize:
         self.normalize()
예제 #2
0
def make_bingliu_lexicon():
    """Convert the Bing Liu opinion word lists into a JSON polarity lexicon.

    Reads ``bl_opinion_lexicon/positive-words.txt`` and
    ``bl_opinion_lexicon/negative-words.txt`` below ``constants.LEXICONS``,
    maps each word to 1 (positive list) or -1 (negative list) and writes the
    mapping to ``constants.PROCESSED_LEXICONS + 'bingliu.json'``.

    Empty lines and lines starting with ``;`` are skipped. Non-ASCII
    characters are dropped from each word. A word present in both lists keeps
    the value from the negative list, which is processed last.
    """
    polarities = {}
    for polarity in ['positive', 'negative']:
        score = 1 if polarity == 'positive' else -1
        source = (constants.LEXICONS +
                  'bl_opinion_lexicon/{:}-words.txt'.format(polarity))
        for line in util.lines(source):
            try:
                # Round-trip through ASCII to drop non-ASCII characters while
                # staying a text string. Keeping the encoded bytes (Python 3)
                # would make the ';' comparison below always false and would
                # produce bytes dictionary keys.
                word = line.strip().encode('ascii', 'ignore').decode('ascii')
            except UnicodeError:
                print("skipping", line)
                continue
            if not word or word.startswith(';'):
                continue
            polarities[word] = score
    util.write_json(polarities, constants.PROCESSED_LEXICONS + 'bingliu.json')
예제 #3
0
def make_twitter_lexicon():
    """Convert the MaxDiff Twitter lexicon into a JSON polarity lexicon.

    Each input line is read as ``<score> <term>``. Single-token terms are
    mapped to -1 when the score is negative and to 1 otherwise; the result is
    written to ``constants.PROCESSED_LEXICONS + 'twitter.json'``.

    Lines without both a score and a term, and terms made of more than one
    whitespace-separated token, are skipped.

    Raises:
        ValueError: If the first field of a line is not a valid float.
    """
    polarities = {}
    for line in util.lines(
            constants.LEXICONS +
            "twitter/MaxDiff-Twitter-Lexicon/Maxdiff-Twitter-Lexicon_-1to1.txt"
    ):
        # Split off only the score so the term keeps its internal whitespace.
        # Splitting the whole line on whitespace would make the multi-word
        # test below impossible to trigger and would record a phrase under
        # its first word.
        info = line.split(None, 1)
        if len(info) < 2:
            continue
        score = info[0]
        term = info[1].strip()
        if len(term.split()) > 1:
            continue
        polarities[term] = -1 if float(score) < 0 else 1
    util.write_json(polarities, constants.PROCESSED_LEXICONS + 'twitter.json')
예제 #4
0
 def __init__(self, path, words, dim=300, normalize=True, **kwargs):
     """Read a whitespace-separated embedding file, keeping selected words.

     The first token of each line is taken as the word; the rest of the
     tokens are converted to floats and stored as that word's vector.

     Args:
         path: Handed to ``lines`` to obtain the file's lines.
         words: Container tested with ``w in words`` to decide what to keep.
         dim: Not used by this constructor; kept for interface compatibility.
         normalize: If true, ``self.normalize()`` runs once loading is done.
         **kwargs: Accepted and ignored.

     Attributes set:
         iw: Words that were kept, in file order.
         wi: Dictionary from word to its position in ``iw``.
         m: float32 matrix with one stacked vector per entry of ``iw``.

     Raises:
         ValueError: From ``np.vstack`` when none of ``words`` was found.
     """
     seen = []
     vs = {}
     for line in lines(path):
         split = line.split()
         if not split:
             # Skip blank lines instead of failing on split[0].
             continue
         w = split[0]
         if w in words:
             seen.append(w)
             # A list comprehension works on both Python 2 and 3, whereas a
             # Python 3 ``map`` object is not converted by np.array.
             vs[w] = np.array([float(x) for x in split[1:]], dtype='float32')
     self.iw = seen
     self.wi = {w: i for i, w in enumerate(self.iw)}
     # Pass a list: np.vstack does not accept a generator in current NumPy.
     self.m = np.vstack([vs[w] for w in self.iw])
     if normalize:
         self.normalize()
예제 #5
0
def make_qwn_lexicon():
    """Convert the Q-WordNet dictionary file into a JSON polarity lexicon.

    Each tab-separated line contributes one vote per listed word: -1 when the
    second field is ``"neg"``, +1 otherwise. The third field is a ``", "``
    separated word list; entries containing ``_`` are ignored and only the
    text before the first ``#`` is used as the word. Words whose votes cancel
    out are dropped; the rest are stored as the sign of their vote total and
    written to ``constants.PROCESSED_LEXICONS + 'qwn.json'``.

    Raises:
        IndexError: If a line has fewer than three tab-separated fields.
    """
    polarities = collections.defaultdict(float)
    for line in util.lines(
            constants.LEXICONS +
            "qwn/turneyLittman_propSyn_08_mcr30-noAntGloss.dict"):
        info = line.split("\t")
        mod = -1 if info[1] == "neg" else 1
        for word in info[2].split(", "):
            if "_" not in word:
                polarities[word.split("#")[0]] += mod
    # ``items()`` exists on both Python 2 and 3 dicts; ``iteritems()`` is
    # Python 2 only.
    polarities = {
        word: np.sign(val)
        for word, val in polarities.items() if val != 0
    }
    util.write_json(polarities, constants.PROCESSED_LEXICONS + 'qwn.json')
예제 #6
0
def make_inquirer_lexicon():
    """Convert the General Inquirer CSV into a JSON polarity lexicon.

    Every record (records may be separated by ``\\r`` inside a line) is split
    on commas. The lower-cased first field is the word; when it carries a
    ``#`` suffix only the entry whose suffix is ``1`` is used. The last field
    flags negativity (``'Negativ'``) and the second-to-last flags positivity
    (``'Positiv'``). Records flagged as both are skipped; otherwise the word
    maps to -1, 1 or 0. The mapping is written to
    ``constants.PROCESSED_LEXICONS + 'inquirer.json'``.
    """
    polarities = {}
    for line in util.lines(constants.LEXICONS + 'inquirerbasic.csv'):
        for record in line.strip().split('\r'):
            fields = record.split(",")
            entry = fields[0].lower()
            if "#" in entry:
                pieces = entry.split("#")
                if pieces[1] != "1":
                    continue
                entry = pieces[0]
            is_negative = fields[-1] == 'Negativ'
            is_positive = fields[-2] == 'Positiv'
            if is_negative and is_positive:
                # Contradictory flags: leave the word out entirely.
                continue
            if is_negative:
                polarities[entry] = -1
            elif is_positive:
                polarities[entry] = 1
            else:
                polarities[entry] = 0
    util.write_json(polarities, constants.PROCESSED_LEXICONS + 'inquirer.json')
예제 #7
0
def make_140_scores_lexicon():
    """Convert the Sentiment140 unigram lexicon into a JSON score lexicon.

    For every line, the first whitespace-separated field becomes the key and
    the second field, parsed as a float, becomes its score. The mapping is
    written to ``constants.PROCESSED_LEXICONS + '140-scores.json'``.
    """
    source = constants.LEXICONS + "Sentiment140-Lexicon-v0.1/unigrams-pmilexicon.txt"
    rows = (entry.split() for entry in util.lines(source))
    scores = {fields[0]: float(fields[1]) for fields in rows}
    util.write_json(scores, constants.PROCESSED_LEXICONS + '140-scores.json')
예제 #8
0
def get_vocab(fname, limit=None):
    """Return the first token of each line in ``fname``, capped at ``limit``.

    All lines are read before the cap is applied; ``limit=None`` returns the
    whole list.
    """
    first_tokens = []
    for row in lines(fname):
        first_tokens.append(row.split()[0])
    return first_tokens[:limit]