import json
import os

from nltk.tokenize import word_tokenize


def main():
    """Clean every topic's JSON files in place: drop bookkeeping fields,
    assign a running id, and tokenize the text fields listed in CONTENTS."""
    doc_id = 1
    for topic in TOPICS:
        path = PATH + topic
        directory = os.fsencode(path)
        for entry in os.listdir(directory):
            filename = os.fsdecode(entry)
            if not filename.endswith(".json"):
                continue
            filepath = os.path.join(path, filename)
            with open(filepath) as infile:
                data = json.load(infile)
            # Drop local bookkeeping fields that should not be kept.
            data.pop('localpath', None)
            data.pop('date_download', None)
            data.pop('date_modify', None)
            data.pop('filename', None)
            data["id"] = doc_id
            doc_id += 1
            # Tokenize every configured text field that is present and non-empty.
            for content in CONTENTS:
                if content in data and data[content] is not None:
                    data[content] = word_tokenize(data[content])
            with open(filepath, 'w') as outfile:
                json.dump(data, outfile)
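# Usage sketch (assumptions): TOPICS, PATH, and CONTENTS are module-level
# constants defined elsewhere; the values below are hypothetical placeholders
# that only illustrate the expected shapes.
#
# PATH = "/data/news/"                             # corpus root, one folder per topic
# TOPICS = ["sports", "politics"]                  # sub-directories under PATH
# CONTENTS = ["title", "description", "maintext"]  # JSON fields to tokenize
#
# if __name__ == "__main__":
#     main()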
# Assumed module-level imports for this method: `from collections import
# defaultdict`, `import numpy as np`, and `import pandas as pd`.
def get_stats(self, generator, arto=False, format_pandas=False):
    """Collect per-word stress statistics for every tree yielded by
    `generator` and return them as a pandas DataFrame (if `format_pandas`)
    or as a list of per-word dicts."""
    data = defaultdict(list)
    i = 0
    for t in generator:
        i += 1
        ambig1 = t.ambiguity(stress_polysyll=False)
        ambig2 = t.ambiguity(stress_polysyll=True)

        # Build the maximally and minimally stressed disambiguations.
        tree1 = t.max_stress_disambiguate()[0]
        tree1.set_pstress()
        tree1.set_stress()
        tree2a = t.min_stress_disambiguate(stress_polysyll=True)[0]
        tree2a.set_pstress()
        tree2a.set_stress()
        tree2b = t.min_stress_disambiguate(stress_polysyll=False)[0]
        tree2b.set_pstress()
        tree2b.set_stress()
        j = 0

        # Per-tree minimum and range of stress values, used for normalisation.
        preterms1 = list(tree1.preterminals())
        stresses1 = [p.stress() for p in preterms1 if not np.isnan(p.stress())]
        min1 = float(min(stresses1))
        max1 = max(stresses1) - min1

        preterms2a = list(tree2a.preterminals())
        stresses2a = [p.stress() for p in preterms2a if not np.isnan(p.stress())]
        min2a = float(min(stresses2a))
        max2a = max(stresses2a) - min2a

        preterms2b = list(tree2b.preterminals())
        stresses2b = [p.stress() for p in preterms2b if not np.isnan(p.stress())]
        min2b = float(min(stresses2b))
        max2b = max(stresses2b) - min2b

        preterms_raw = list(t.preterminals())
        mean_stresses = [
            np.mean([p1.stress(), p2a.stress(), p2b.stress()])
            for p1, p2a, p2b in zip(preterms1, preterms2a, preterms2b)
            if not np.isnan(p1.stress())
        ]
        minmean = float(min(mean_stresses))
        maxmean = max(mean_stresses) - minmean

        sent = ' '.join([preterm[0] for preterm in preterms_raw])
        sentlen = len(preterms_raw)

        for preterm1, preterm2a, preterm2b, preterm_raw in zip(
                preterms1, preterms2a, preterms2b, preterms_raw):
            j += 1
            data['widx'].append(j)
            data['norm_widx'].append(float(j) / sentlen if sentlen else np.nan)
            data['word'].append(preterm1[0])
            if preterm_raw._lstress == 0:
                data['lexstress'].append('yes')
            elif preterm_raw._lstress == -.5:
                data['lexstress'].append('ambig')
            elif preterm_raw._lstress == -1:
                data['lexstress'].append('no')
            else:
                data['lexstress'].append('???')
            data['seg'].append(' '.join(preterm1.seg()))
            data['nseg'].append(preterm1.nseg())
            data['nsyll'].append(preterm1.nsyll())
            data['nstress'].append(preterm1.nstress())
            data['pos'].append(preterm1.category())
            data['dep'].append(preterm1.dependency())
            mean_stress = np.mean([
                preterm1.stress(), preterm2a.stress(), preterm2b.stress()
            ])
            if arto:
                # Arto-style metric: shift and invert the raw stress values.
                data['m1'].append(-(preterm1.stress() - 1))
                data['m2a'].append(-(preterm2a.stress() - 1))
                data['m2b'].append(-(preterm2b.stress() - 1))
                data['mean'].append(-(mean_stress - 1))
            else:
                data['m1'].append(preterm1.stress())
                data['m2a'].append(preterm2a.stress())
                data['m2b'].append(preterm2b.stress())
                data['mean'].append(mean_stress)
            data['norm_m1'].append((preterm1.stress() - min1) / max1 if max1 else np.nan)
            data['norm_m2a'].append((preterm2a.stress() - min2a) / max2a if max2a else np.nan)
            data['norm_m2b'].append((preterm2b.stress() - min2b) / max2b if max2b else np.nan)
            data['norm_mean'].append((mean_stress - minmean) / maxmean if maxmean else np.nan)
            data['sidx'].append(i)
            data['sent'].append(sent)
            data['ambig_words'].append(ambig1)
            data['ambig_monosyll'].append(ambig2)
        # Repeat the sentence-level stress contour once per word in the sentence.
        data['contour'].extend([' '.join(str(x) for x in data['mean'][-j:])] * j)
    if format_pandas:
        for k, v in data.items():
            data[k] = pd.Series(v)
        df = pd.DataFrame(data, columns=[
            'widx', 'norm_widx', 'word', 'seg', 'lexstress', 'nseg', 'nsyll',
            'nstress', 'pos', 'dep', 'm1', 'm2a', 'm2b', 'mean', 'norm_m1',
            'norm_m2a', 'norm_m2b', 'norm_mean', 'sidx', 'sent', 'ambig_words',
            'ambig_monosyll', 'contour'
        ])
        return df
    # Otherwise return a plain list of per-word row dictionaries.
    keys = list(data.keys())
    old = []
    num_rows = len(data[keys[0]])
    for i_row in range(num_rows):
        dx = {}
        for k in keys:
            dx[k] = data[k][i_row]
        old += [dx]
    return old
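# Usage sketch (all names are hypothetical): `analyzer` stands for an instance
# of the class this method belongs to and `parse_trees` for whatever yields the
# tree objects it expects; only the call pattern is illustrated.
#
# df = analyzer.get_stats(parse_trees("poems.txt"), arto=True, format_pandas=True)
# df.groupby('lexstress')['norm_mean'].describe()
#
# rows = analyzer.get_stats(parse_trees("poems.txt"), format_pandas=False)
# pd.DataFrame(rows)  # the list-of-dicts form can still be rebuilt into a frame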
import random

import nltk
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

# `genTopic`, `dePunc`, and `stemmer` (a WordNet lemmatizer) are assumed to be
# defined elsewhere in this module.


def genThesis(topic):
    """
    In order to generate the thesis, we need the following:
    - title
    - url
    - rating = opinion
    - thesis: the bold text for the yes or no section
      (the thesis has to contain some keywords from the title)
    - support: the sentence following the yes or no bold statement
    """
    title, url, category = genTopic(topic)
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")

    # Remove stopwords from the title using the nltk stop list and strip
    # punctuation to obtain the title keywords.
    cleaned = []
    keywords = [w for w in title.lower().split()
                if w not in stopwords.words("english")]
    for word in keywords:
        cleaned.append(dePunc(word))
    keys = ' '.join(cleaned)

    vote = soup.find("span", "no-text").text
    strings = str(vote).split()
    rating = int(strings[0][:-1])  # this is the 'no' rating

    clean_keys = []
    data = {}

    # For each top argument, check that its heading is long enough and shares
    # more than one word with the title keywords. Note that clean_keys
    # accumulates across arguments, as in the original logic.
    section = 'no-arguments' if rating > 50 else 'yes-arguments'  # majority "No" vs "Yes"
    args = soup.find('div', attrs={'id': section}).find_all("li", "hasData")
    for item in args:
        temp = item.find("h2").text
        userArg = item.find("p").text
        tmps = temp.split()
        for word in tmps:
            clean_keys.append(dePunc(word))
        if len(clean_keys) > 3:
            count = 0
            for a in clean_keys:
                for b in cleaned:
                    if stemmer.lemmatize(a).lower() == stemmer.lemmatize(b).lower():
                        count += 1
            if count > 1:
                # Store as str; the original encoded to UTF-8 only to work
                # around Python 2 ASCII errors.
                data[temp] = userArg

    # Form the thesis by taking a random opinion and its supporting argument.
    if not data:
        return "", "", "", ""

    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    one = random.choice(list(data.keys()))
    two = data[one]
    if two == "":
        return "", "", "", ""
    tokens = tokenizer.tokenize(two.strip())
    thesis_len = len(tokens)
    support = ' '.join(tokens)
    # Keep drawing until the supporting argument is at most five sentences long.
    while thesis_len > 5:
        one = random.choice(list(data.keys()))
        two = data[one]
        tokens = tokenizer.tokenize(two.strip())
        thesis_len = len(tokens)
        support = ' '.join(tokens)

    if tokens[0] == one:
        one = ""
    # Make sure the chosen heading ends with a full stop.
    if one and one[-1] != '.':
        one = one + '.'
    thesis = one + " " + support
    return title, one, support, url
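# Usage sketch (assumption): `genTopic`, `dePunc`, and `stemmer` are available
# in this module; the topic string below is a hypothetical example.
#
# title, claim, support, url = genThesis("technology")
# if claim or support:
#     print(title)
#     print(claim, support)
# else:
#     print("No suitable argument found.")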
# Assumed module-level context: the Flask `app` and `session`, `abort` from
# flask, `requests`, `json`, and the project's `sentiment_analyzer` helper.
def facebook():
    """Fetch page metrics and the latest feed posts from the Facebook Graph
    API for the logged-in user's hotel, with per-post sentiment counts."""
    connect = app.config['SOCIALMEDIADATA_COLLECTION']
    user_obj = session.get('user', None)
    hotel = user_obj['hotel']
    place = user_obj['Place']
    data = list(connect.find({'hotel': hotel, 'Place': place}))
    fbkey = data[0]['fbkeyword']
    if fbkey != "":
        url = ("https://graph.facebook.com/{0}/?fields=fan_count,"
               "talking_about_count,rating_count,checkins,overall_star_rating,"
               "feed.limit(100){{name,message,picture,type,link,"
               "likes.limit(0).summary(true),shares,"
               "comments.limit(0).summary(true)}},picture"
               "&access_token={1}").format(fbkey, app.config["FB_TOKEN"])
        r = requests.get(url)
        data = json.loads(r.text)
        if 'error' in data:
            abort(500)  # abort() raises the HTTPException itself

        JSON = []
        total_fb = 0
        total_fb_pos = 0
        total_fb_neg = 0
        total_fb_neu = 0
        for d in data["feed"]["data"]:
            total_fb += 1
            cur = {
                'name': str(d['name']) if 'name' in d else None,
                'picture': str(d['picture']) if 'picture' in d else None,
                'link': str(d['link']) if 'link' in d else None,
                'message': str(d['message']) if 'message' in d else None,
                'shares': d['shares']['count'] if 'shares' in d else 0,
                'total_likes': d['likes']['summary']['total_count'] if 'likes' in d else 0,
                'total_comments': d['comments']['summary']['total_count'] if 'comments' in d else 0,
                'sentiment': str(sentiment_analyzer(str(d['message']))) if 'message' in d else None,
            }
            if cur['sentiment'] == 'Positive':
                total_fb_pos += 1
            elif cur['sentiment'] == 'Neutral':
                total_fb_neu += 1
            else:
                total_fb_neg += 1
            JSON.append(cur)
        data = [{
            "data": JSON,
            "user_picture": data['picture'],
            "fan_count": data['fan_count'],
            "talking_about_count": data['talking_about_count'],
            "rating_count": data['rating_count'],
            "checkins": data['checkins'],
            "overall_star_rating": data['overall_star_rating'],
            "socialchannel": "Facebook",
            "totalFb": {
                'total_fb': total_fb,
                'total_fb_pos': total_fb_pos,
                'total_fb_neu': total_fb_neu,
                'total_fb_neg': total_fb_neg
            },
            "imgPath": "/static/images/fb.png"
        }]
    else:
        # No Facebook keyword configured for this hotel: return empty metrics.
        data = [{
            "socialchannel": "Facebook",
            "totalFb": {
                'total_fb': 0,
                'total_fb_pos': 0,
                'total_fb_neu': 0,
                'total_fb_neg': 0
            },
            "fan_count": 0,
            "talking_about_count": 0,
            "checkins": 0,
            "imgPath": "/static/images/fb.png"
        }]
    return data
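# Usage sketch (assumption): this function returns a plain list, so a thin
# route wrapping it with Flask's jsonify could expose it as JSON; the route
# path and wrapper name below are hypothetical.
#
# from flask import jsonify
#
# @app.route("/api/social/facebook")
# def facebook_stats():
#     return jsonify(facebook())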