Example #1
def main():
    doc_id = 1  # renamed from `id`, which shadows the builtin

    for topic in TOPICS:
        path = os.path.join(PATH, topic)
        for filename in os.listdir(path):
            if not filename.endswith(".json"):
                continue
            filepath = os.path.join(path, filename)
            with open(filepath) as infile:
                data = json.load(infile)
            # Drop crawler bookkeeping fields before re-serialising.
            for key in ('localpath', 'date_download', 'date_modify', 'filename'):
                data.pop(key, None)
            data["id"] = doc_id
            doc_id += 1
            # Tokenize every configured text field that is present and non-null.
            for content in CONTENTS:
                if data.get(content) is not None:
                    data[content] = word_tokenize(data[content])
            with open(filepath, 'w') as outfile:
                json.dump(data, outfile)
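The extract omits the module's imports and constants; a plausible reconstruction, with illustrative values for the names the snippet assumes (PATH, TOPICS, CONTENTS):

import json
import os

from nltk.tokenize import word_tokenize  # assumed source of word_tokenize

# Illustrative values only; the real project defines these elsewhere.
PATH = "data"                                 # root folder, one subfolder per topic
TOPICS = ["politics", "sports"]               # subfolder names under PATH
CONTENTS = ["title", "text", "description"]   # JSON fields to tokenize

if __name__ == "__main__":
    main()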
Example #2
    def get_stats(self, generator, arto=False, format_pandas=False):
        """Collect per-word stress statistics for each tree in `generator`.

        Returns a pandas DataFrame when `format_pandas` is true, otherwise
        a list of per-word dicts (one dict per row).
        """

        data = defaultdict(list)
        i = 0
        for t in generator:
            i += 1
            ambig1 = t.ambiguity(stress_polysyll=False)
            ambig2 = t.ambiguity(stress_polysyll=True)
            tree1 = t.max_stress_disambiguate()[0]
            tree1.set_pstress()
            tree1.set_stress()
            tree2a = t.min_stress_disambiguate(stress_polysyll=True)[0]
            tree2a.set_pstress()
            tree2a.set_stress()
            tree2b = t.min_stress_disambiguate(stress_polysyll=False)[0]
            tree2b.set_pstress()
            tree2b.set_stress()

            j = 0
            preterms1 = list(tree1.preterminals())
            preterms2a = list(tree2a.preterminals())
            preterms2b = list(tree2b.preterminals())
            preterms_raw = list(t.preterminals())
            # Per-sentence minimum and range (max - min) for min-max scaling.
            stresses1 = [p.stress() for p in preterms1 if not np.isnan(p.stress())]
            min1 = float(min(stresses1))
            max1 = max(stresses1) - min1
            stresses2a = [p.stress() for p in preterms2a if not np.isnan(p.stress())]
            min2a = float(min(stresses2a))
            max2a = max(stresses2a) - min2a
            stresses2b = [p.stress() for p in preterms2b if not np.isnan(p.stress())]
            min2b = float(min(stresses2b))
            max2b = max(stresses2b) - min2b
            means = [np.mean([p1.stress(), p2a.stress(), p2b.stress()])
                     for p1, p2a, p2b in zip(preterms1, preterms2a, preterms2b)
                     if not np.isnan(p1.stress())]
            minmean = float(min(means))
            maxmean = max(means) - minmean
            sent = ' '.join([preterm[0] for preterm in preterms_raw])
            sentlen = len(preterms_raw)
            for preterm1, preterm2a, preterm2b, preterm_raw in zip(preterms1, preterms2a, preterms2b, preterms_raw):
                j += 1
                data['widx'].append(j)
                data['norm_widx'].append(float(j) / sentlen if sentlen else np.nan)
                data['word'].append(preterm1[0])
                if preterm_raw._lstress == 0:
                    data['lexstress'].append('yes')
                elif preterm_raw._lstress == -.5:
                    data['lexstress'].append('ambig')
                elif preterm_raw._lstress == -1:
                    data['lexstress'].append('no')
                else:
                    data['lexstress'].append('???')
                data['seg'].append(' '.join(preterm1.seg()))
                data['nseg'].append(preterm1.nseg())
                data['nsyll'].append(preterm1.nsyll())
                data['nstress'].append(preterm1.nstress())
                data['pos'].append(preterm1.category())
                data['dep'].append(preterm1.dependency())
                if arto:
                    data['m1'].append(-(preterm1.stress()-1))
                    data['m2a'].append(-(preterm2a.stress()-1))
                    data['m2b'].append(-(preterm2b.stress()-1))
                    data['mean'].append(-(np.mean([preterm1.stress(), preterm2a.stress(), preterm2b.stress()])-1))
                else:
                    data['m1'].append(preterm1.stress())
                    data['m2a'].append(preterm2a.stress())
                    data['m2b'].append(preterm2b.stress())
                    data['mean'].append(np.mean([preterm1.stress(), preterm2a.stress(), preterm2b.stress()]))
                data['norm_m1'].append((preterm1.stress()-min1)/max1 if max1 else np.nan)
                data['norm_m2a'].append((preterm2a.stress()-min2a)/max2a if max2a else np.nan)
                data['norm_m2b'].append((preterm2b.stress()-min2b)/max2b if max2b else np.nan)
                data['norm_mean'].append((np.mean([preterm1.stress(), preterm2a.stress(), preterm2b.stress()])-minmean)/maxmean if maxmean else np.nan)
                data['sidx'].append(i)
                data['sent'].append(sent)
                data['ambig_words'].append(ambig1)
                data['ambig_monosyll'].append(ambig2)
            data['contour'].extend([' '.join(str(x) for x in data['mean'][-(j):])]*j)

        if format_pandas:
            for k, v in data.items():
                data[k] = pd.Series(v)
            df = pd.DataFrame(data, columns=['widx', 'norm_widx', 'word', 'seg', 'lexstress',
                                               'nseg', 'nsyll', 'nstress',
                                               'pos', 'dep',
                                               'm1', 'm2a', 'm2b', 'mean',
                                               'norm_m1', 'norm_m2a', 'norm_m2b', 'norm_mean',
                                               'sidx', 'sent', 'ambig_words', 'ambig_monosyll',
                                               'contour'])
            return df

        keys = list(data.keys())
        old = []
        num_rows = len(data[keys[0]])
        for i_row in range(num_rows):
            dx = {k: data[k][i_row] for k in keys}
            old.append(dx)

        return old
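A hedged usage sketch: the method only assumes an iterable of metrical-tree objects exposing the calls made above, and the column names are the keys the method itself fills in (the `stats` instance name is hypothetical):

# `trees`: any iterable of tree objects with ambiguity(),
# max/min_stress_disambiguate(), set_pstress(), set_stress(), preterminals().
rows = stats.get_stats(trees)                    # list of per-word dicts
df = stats.get_stats(trees, format_pandas=True)  # same rows as a DataFrame
print(df[['word', 'lexstress', 'm1', 'mean', 'norm_mean']].head())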
Example #3
    def get_stats(self, generator, arto=False, format_pandas=False):
        """"""

        data = defaultdict(list)
        i = 0
        for t in generator:
            i += 1
            ambig1 = t.ambiguity(stress_polysyll=False)
            ambig2 = t.ambiguity(stress_polysyll=True)
            tree1 = t.max_stress_disambiguate()[0]
            tree1.set_pstress()
            tree1.set_stress()
            tree2a = t.min_stress_disambiguate(stress_polysyll=True)[0]
            tree2a.set_pstress()
            tree2a.set_stress()
            tree2b = t.min_stress_disambiguate(stress_polysyll=False)[0]
            tree2b.set_pstress()
            tree2b.set_stress()

            j = 0
            preterms1 = list(tree1.preterminals())
            min1 = float(
                min([
                    preterm.stress() for preterm in preterms1
                    if not np.isnan(preterm.stress())
                ]))
            max1 = max([
                preterm.stress()
                for preterm in preterms1 if not np.isnan(preterm.stress())
            ]) - min1
            preterms2a = list(tree2a.preterminals())
            min2a = float(
                min([
                    preterm.stress() for preterm in preterms2a
                    if not np.isnan(preterm.stress())
                ]))
            max2a = max([
                preterm.stress()
                for preterm in preterms2a if not np.isnan(preterm.stress())
            ]) - min2a
            preterms2b = list(tree2b.preterminals())
            min2b = float(
                min([
                    preterm.stress() for preterm in preterms2b
                    if not np.isnan(preterm.stress())
                ]))
            max2b = max([
                preterm.stress()
                for preterm in preterms2b if not np.isnan(preterm.stress())
            ]) - min2b
            preterms_raw = list(t.preterminals())
            minmean = float(
                min([
                    np.mean([
                        preterm1.stress(),
                        preterm2a.stress(),
                        preterm2b.stress()
                    ]) for preterm1, preterm2a, preterm2b in zip(
                        preterms1, preterms2a, preterms2b)
                    if not np.isnan(preterm1.stress())
                ]))
            maxmean = max([
                np.mean([
                    preterm1.stress(),
                    preterm2a.stress(),
                    preterm2b.stress()
                ]) for preterm1, preterm2a, preterm2b in zip(
                    preterms1, preterms2a, preterms2b)
                if not np.isnan(preterm1.stress())
            ]) - minmean
            sent = ' '.join([preterm[0] for preterm in preterms_raw])
            sentlen = len(preterms_raw)
            for preterm1, preterm2a, preterm2b, preterm_raw in zip(
                    preterms1, preterms2a, preterms2b, preterms_raw):
                j += 1
                data['widx'].append(j)
                data['norm_widx'].append(
                    float(j) / sentlen if sentlen else np.nan)
                data['word'].append(preterm1[0])
                if preterm_raw._lstress == 0:
                    data['lexstress'].append('yes')
                elif preterm_raw._lstress == -.5:
                    data['lexstress'].append('ambig')
                elif preterm_raw._lstress == -1:
                    data['lexstress'].append('no')
                else:
                    data['lexstress'].append('???')
                data['seg'].append(' '.join(preterm1.seg()))
                data['nseg'].append(preterm1.nseg())
                data['nsyll'].append(preterm1.nsyll())
                data['nstress'].append(preterm1.nstress())
                data['pos'].append(preterm1.category())
                data['dep'].append(preterm1.dependency())
                if arto:
                    data['m1'].append(-(preterm1.stress() - 1))
                    data['m2a'].append(-(preterm2a.stress() - 1))
                    data['m2b'].append(-(preterm2b.stress() - 1))
                    data['mean'].append(-(np.mean([
                        preterm1.stress(),
                        preterm2a.stress(),
                        preterm2b.stress()
                    ]) - 1))
                else:
                    data['m1'].append(preterm1.stress())
                    data['m2a'].append(preterm2a.stress())
                    data['m2b'].append(preterm2b.stress())
                    data['mean'].append(
                        np.mean([
                            preterm1.stress(),
                            preterm2a.stress(),
                            preterm2b.stress()
                        ]))
                data['norm_m1'].append((preterm1.stress() - min1) /
                                       max1 if max1 else np.nan)
                data['norm_m2a'].append((preterm2a.stress() - min2a) /
                                        max2a if max2a else np.nan)
                data['norm_m2b'].append((preterm2b.stress() - min2b) /
                                        max2b if max2b else np.nan)
                data['norm_mean'].append((np.mean([
                    preterm1.stress(),
                    preterm2a.stress(),
                    preterm2b.stress()
                ]) - minmean) / maxmean if maxmean else np.nan)
                data['sidx'].append(i)
                data['sent'].append(sent)
                data['ambig_words'].append(ambig1)
                data['ambig_monosyll'].append(ambig2)
            data['contour'].extend(
                [' '.join(str(x) for x in data['mean'][-(j):])] * j)

        if format_pandas:
            for k, v in data.items():
                data[k] = pd.Series(v)
            df = pd.DataFrame(data,
                              columns=[
                                  'widx', 'norm_widx', 'word', 'seg',
                                  'lexstress', 'nseg', 'nsyll', 'nstress',
                                  'pos', 'dep', 'm1', 'm2a', 'm2b', 'mean',
                                  'norm_m1', 'norm_m2a', 'norm_m2b',
                                  'norm_mean', 'sidx', 'sent', 'ambig_words',
                                  'ambig_monosyll', 'contour'
                              ])
            return df

        keys = list(data.keys())
        old = []
        num_rows = len(data[keys[0]])
        for i_row in range(num_rows):
            dx = {}
            for k in keys:
                dx[k] = data[k][i_row]
            old += [dx]

        return old
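For reference, the norm_* columns apply per-sentence min-max scaling (note that max1, max2a, max2b, and maxmean actually hold the range, max minus min), and arto=True remaps a stress value s to 1 - s. A minimal standalone sketch of the scaling:

import numpy as np

def minmax_norm(values):
    """Per-sentence min-max scaling, mirroring the norm_* columns above."""
    observed = [v for v in values if not np.isnan(v)]
    lo = min(observed)
    span = max(observed) - lo
    # A sentence with no spread yields NaN, as in the code above.
    return [(v - lo) / span if span else np.nan for v in values]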
Example #4
File: thesis2.py Project: divir94/AITED
def genThesis(topic):
    """
    in order to generate the thesis, we need the following:
        - title
        - url
        - rating = opinion
        - thesis: the bold text for the yes or no section 
            ** the thesis has to contain some keywords from the title
        - support: the sentence following the yes or no bold statement
    """

    title, url, category = genTopic(topic)

    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')

    """ remove stopwords using nltk stop list and print the keywords """
    # stoppers = stopWords()

    keywords = [w for w in title.lower().split() if not w in stopwords.words("english")]
    # print "keywords: ", keywords
    for i in keywords:
        cleaned.append(dePunc(i))
    # print "cleaned: ", cleaned

    # cleaned list of keywords
    keys = ' '.join(cleaned)

    vote = soup.find("span", "no-text").text
    strings = str(vote).split()
    rating = int(strings[0][:-1])  # this is the 'no' rating
    clean_keys = []
    data = {}

    """ for each top argument, check if it is long enough and contains more than one word from
    the list of title keywords. """
    if rating > 50:
        section_id = 'no-arguments'   # majority voted "No"
    else:
        section_id = 'yes-arguments'  # majority voted "Yes"
    args = soup.find('div', attrs={'id': section_id}).find_all("li", "hasData")
    for item in args:
        # Each list item holds a bold heading (the argument) and a paragraph
        # (its supporting text).
        temp = item.find("h2").text
        userArg = item.find("p").text
        # NB: clean_keys accumulates across arguments, as in the original.
        for word in temp.split():
            clean_keys.append(dePunc(word))
        if len(clean_keys) > 3:
            count = 0
            for i in clean_keys:
                for j in cleaned:
                    if stemmer.lemmatize(i).lower() == stemmer.lemmatize(j).lower():
                        count += 1
                        if count > 1:
                            # Python 3 strings are already unicode; the old
                            # .encode('utf8') workaround is unnecessary.
                            data[temp] = userArg

    """ form the thesis by taking a random opinion and it's supporting argument """
    # long_support = userArg

    # print data

    # if not opinions: #checking if opinions is empty
    #   print "Couldn't find anything - opinions"
    #   # return "", ""
    # elif not userArg: #checking if userArgs is empty
    #   print "Couldn't find anything - arguments"
    #   # return "", ""
    # else: #if they aren't empty, do this
    #   # print 'Top Argument: '+opinions[0]+'\n'
    #   # topArg = opinions[0].split()
    #   """ send the thesis and userArgs off to the function to be strengthened """
    #   # thesis = opinions[0]+' '+long_support
    #   thesis_stmt = thesis(arg, long_support)
    #   print "Thesis: "
    #   print thesis
    #   # return title, thesis

    if not data:
        return "", "", "", ""
    else:
        one = random.choice(list(data.keys()))  # list() needed in Python 3
        two = data[one]

        if two == "":
            return "", "", "", ""
        else:
            tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
            tokens = tokenizer.tokenize(two.strip())
            thesis_len = len(tokens)
            support = ' '.join(tokens)
            # Re-draw until the support runs to at most five sentences.
            # NB: as in the original, this never terminates if every
            # argument is longer than that.
            while thesis_len > 5:
                one = random.choice(list(data.keys()))
                two = data[one]
                tokens = tokenizer.tokenize(two.strip())
                thesis_len = len(tokens)
                support = ' '.join(tokens)
            # Drop the heading when it merely repeats the first sentence of
            # the support.
            if tokens[0] == one:
                one = ""

            # Guard the empty string (the original indexed one[-1], which
            # raises IndexError when one == ""), then end with a period.
            if one and not one.endswith('.'):
                one = one + '.'

            return title, one, support, url
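The extract also leaves out this module's imports and helpers; a hypothetical reconstruction consistent with how the names are used above (dePunc and stemmer are stand-ins; genTopic must come from the project itself):

import random

import nltk
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()  # the code above calls stemmer.lemmatize(word)

def dePunc(word):
    """Stand-in for the project's helper: strip punctuation from a token."""
    return ''.join(ch for ch in word if ch.isalnum())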
Example #5
def facebook():
    connect = app.config['SOCIALMEDIADATA_COLLECTION']
    user_obj = session.get('user', None)
    hotel = user_obj['hotel']
    place = user_obj['Place']
    data = list(connect.find({'hotel': hotel, 'Place': place}))
    fbkey = data[0]['fbkeyword']  # assumes a matching document exists
    if fbkey != "":
        url = "https://graph.facebook.com/{0}/?fields=fan_count,talking_about_count,rating_count,checkins,overall_star_rating,feed.limit(100){{name,message,picture,type,link,likes.limit(0).summary(true),shares,comments.limit(0).summary(true)}},picture&access_token={1}".format(
            fbkey, app.config["FB_TOKEN"])
        r = requests.get(url)
        data = r.json()
        JSON = []
        total_fb = 0
        total_fb_pos = 0
        total_fb_neg = 0
        total_fb_neu = 0
        if 'error' in data.keys():
            abort(500)
        for d in data["feed"]["data"]:
            total_fb = total_fb + 1
            cur = {}
            if "name" in d:
                cur['name'] = str(d['name'])
            else:
                cur['name'] = None
            if "picture" in d:
                cur['picture'] = str(d['picture'])
            else:
                cur['picture'] = None
            if "link" in d:
                cur['link'] = str(d['link'])
            else:
                cur['link'] = None
            if 'message' in d:
                cur['message'] = str(d['message'])
            else:
                cur['message'] = None
            if 'shares' in d:
                cur['shares'] = d['shares']['count']
            else:
                cur['shares'] = 0
            if 'likes' in d:
                cur['total_likes'] = d['likes']['summary']['total_count']
            else:
                cur['total_likes'] = 0
            if 'comments' in d:
                cur['total_comments'] = d['comments']['summary']['total_count']
            else:
                cur['total_comments'] = 0
            if 'message' in d:
                cur['sentiment'] = str(sentiment_analyzer(str(d['message'])))
            else:
                cur['sentiment'] = None
            if cur['sentiment'] == 'Positive':
                total_fb_pos = total_fb_pos + 1
            elif cur['sentiment'] == 'Neutral':
                total_fb_neu = total_fb_neu + 1
            else:
                total_fb_neg = total_fb_neg + 1
            JSON.append({
                'name': cur['name'],
                'picture': cur['picture'],
                "link": cur['link'],
                'message': cur['message'],
                'shares': cur['shares'],
                'total_likes': cur['total_likes'],
                'total_comments': cur['total_comments'],
                'sentiment': cur['sentiment']
            })
        # The right-hand side still reads the Graph response; `data` is only
        # rebound to the summary list after this literal is evaluated.
        data = [{
            "data": JSON,
            "user_picture": data['picture'],
            "fan_count": data['fan_count'],
            "talking_about_count": data['talking_about_count'],
            "rating_count": data['rating_count'],
            "checkins": data['checkins'],
            "overall_star_rating": data['overall_star_rating'],
            "socialchannel": "Facebook",
            "totalFb": {
                'total_fb': total_fb,
                'total_fb_pos': total_fb_pos,
                'total_fb_neu': total_fb_neu,
                'total_fb_neg': total_fb_neg
            },
            "imgPath": "/static/images/fb.png"
        }]
    else:
        data = [{
            "socialchannel": "Facebook",
            "totalFb": {
                'total_fb': 0,
                'total_fb_pos': 0,
                'total_fb_neu': 0,
                'total_fb_neg': 0
            },
            "fan_count": 0,
            "talking_about_count": 0,
            "checkins": 0,
            "imgPath": "/static/images/fb.png"
        }]
    return data
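The view depends on application state the extract omits; a hypothetical sketch of the missing pieces (the TextBlob classifier is a stand-in for the project's sentiment_analyzer, and the config values are placeholders):

import json

import requests
from flask import Flask, abort, session
from textblob import TextBlob  # stand-in sentiment backend, an assumption

app = Flask(__name__)
app.config["FB_TOKEN"] = "..."  # Graph API token (elided)
# app.config['SOCIALMEDIADATA_COLLECTION'] would be a MongoDB collection.

def sentiment_analyzer(text):
    """Stand-in classifier returning the three labels the view expects."""
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0.1:
        return 'Positive'
    if polarity < -0.1:
        return 'Negative'
    return 'Neutral'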