# Imports this section relies on (assumed to sit at the top of the module;
# keywords() is the gensim < 4.0 TextRank helper):
import json
import numpy as np
import matplotlib.pyplot as plt
from statistics import mean
from fuzzywuzzy import fuzz
from gensim.summarization import keywords


def FuzzyWazzy_SimilarityOverAll(Country, gallery_id):
    # Fuzzy overlap between Google Vision labels and named entities
    # extracted from the gallery comments, as a percentage of the labels.
    S, Data = Load_GalLery_Textual_Data(Country, gallery_id)
    labels, Data1 = Load_GoogleVision_Labels(Country, gallery_id)
    setA = list(set(x.lower() for x in labels))
    setB = get_entities(S)
    if len(setA) == 0 or len(setB) == 0:  # guard both sides; setA divides below
        return 0.0
    overlap = 0
    for l in setA:
        for w in setB:
            if fuzz.ratio(l, w) >= 75:
                overlap += 1
    Similarity = round(float(overlap) / len(setA) * 100., 2)
    # print('overlap =', overlap)
    # print('Labels =', len(setA))
    # print('Comments =', len(setB))
    # print('overlap(Labels,Comments)/Labels =', Similarity)
    return Similarity
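# A minimal, self-contained sketch of the fuzzy-overlap metric above on
# made-up word lists (no gallery loaders needed). 'beach' ~ 'beaches' is the
# only pair that clears the 75 threshold, so the score is 1/3 -> 33.33.
def _fuzzy_overlap_demo():
    labels = ['beach', 'sunset', 'ocean']
    entities = ['beaches', 'sun', 'sea']
    overlap = sum(1 for l in labels for w in entities if fuzz.ratio(l, w) >= 75)
    return round(float(overlap) / len(labels) * 100., 2)  # 33.33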
def Envy_Sentiments():
    # Mean sentiment per luxury gallery, split into positive and negative.
    Galeries_Matrix = np.array(galeries).reshape(len(Countries), 10)
    LuxuryList = [item for sublist in Luxury for item in sublist]
    PSentiments = []
    NSentiments = []
    PComments = []
    NComments = []
    i = 0
    for Country in Countries:
        print(str(i + 1) + ' : ' + Country)
        for j in range(10):
            if Galeries_Matrix[i, j] in LuxuryList:
                Comments, Data = Load_GalLery_Textual_Data(Country, Galeries_Matrix[i, j])
                S = round(mean([float(s) for s in Senti_List(Comments)]), 2)
                if S >= 0:
                    PSentiments.append(S)
                    PComments.append(len(Comments))
                else:
                    NSentiments.append(S)
                    NComments.append(len(Comments))
        i += 1
    return PSentiments, PComments, NSentiments, NComments
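# Senti_List() is defined elsewhere in this project; it is assumed to return
# one polarity score per comment. A TextBlob-based stand-in with the same
# shape (hypothetical, not the project's actual implementation):
def _Senti_List_sketch(Comments):
    from textblob import TextBlob
    return [TextBlob(c).sentiment.polarity for c in Comments]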
def Luxury_vs_NonLuxury(Sentiment=False):
    # Comment counts (and optionally sentiments) grouped by whether the
    # gallery belongs to the luxury list.
    Galeries_Matrix = np.array(galeries).reshape(len(Countries), 10)
    LuxuryList = [item for sublist in Luxury for item in sublist]
    NbComments = []
    Groups = []
    Sentiments = []
    i = 0
    for Country in Countries:
        # print(str(i + 1) + ' : ' + Country)
        for j in range(10):
            Comments, Data = Load_GalLery_Textual_Data(Country, Galeries_Matrix[i, j])
            NbComments.append(len(Comments))
            if Galeries_Matrix[i, j] in LuxuryList:
                Groups.append('Luxury')
            else:
                Groups.append('NonLuxury')
            if Sentiment:
                Sentiments.append(Senti_List(Comments))
        i += 1
    if Sentiment:
        return Groups, NbComments, Sentiments
    else:
        return Groups, NbComments
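# Hedged usage sketch: average comment count per group, assuming the data
# loaders and globals above are in place:
def _luxury_group_means():
    Groups, NbComments = Luxury_vs_NonLuxury()
    lux = [n for g, n in zip(Groups, NbComments) if g == 'Luxury']
    non = [n for g, n in zip(Groups, NbComments) if g == 'NonLuxury']
    return (mean(lux) if lux else 0, mean(non) if non else 0)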
def keyWords_Labels_Matching(Country, gallery_id):
    # Fuzzy overlap between Google Vision labels and TextRank keywords
    # extracted from the gallery comments, normalised three ways.
    DocList, Data = Load_GalLery_Textual_Data(Country, gallery_id)
    S1, Data1 = Load_GoogleVision_Labels(Country, gallery_id)
    data_lemmatized = [w for doc in PrepareData(DocList) for w in doc]
    # print(data_lemmatized)
    fullStr = ' '.join(data_lemmatized)
    # labels = [Preprocessing(x['label']) for x in S1[0]]
    # labels.append(Preprocessing(S1[1]))
    labels = [w for label in PrepareData(S1) for w in label]
    setA = list(set(labels))
    setB = keywords(fullStr).split('\n')
    setB = [w for docs in PrepareData(setB) for w in docs]
    overlap = 0
    for l in setA:
        for w in setB:
            if fuzz.ratio(l, w) >= 75:
                overlap += 1
    # Size of the fuzzy universe: each cluster of fuzzy-similar words counts
    # once. (The original appended fuzzy matches to the counted list itself,
    # which made the universe degenerate to the plain union size.)
    uni = list(set(setA) | set(setB))
    seen = set()
    universe = 0
    for a in range(len(uni)):
        if uni[a] not in seen:
            universe += 1
            seen.add(uni[a])
            for b in range(a + 1, len(uni)):
                if fuzz.ratio(uni[a], uni[b]) >= 75:
                    seen.add(uni[b])
    labels = round(float(overlap) / len(setA) * 100., 2)
    comments = round(float(overlap) / len(setB) * 100., 2)
    overall = round(float(overlap) / float(universe) * 100., 2)
    # print('overlap =', overlap)
    # print('universe =', universe)
    # print('Labels =', len(setA))
    # print('Comments =', len(setB))
    # print('overlap(Labels,Comments)/Labels =', labels)
    # print('overlap(Labels,Comments)/Comments =', comments)
    print('overlap(Labels,Comments)/Universe(Labels,Comments) =', overall)
    return labels, comments, overall, setA, setB
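# Toy illustration of the fuzzy-universe count used above (made-up words,
# same 75 threshold): 'beach' and 'beaches' collapse into one cluster, so
# four words yield a universe of 3.
def _fuzzy_universe_demo():
    uni = ['beach', 'beaches', 'mountain', 'sky']
    seen, universe = set(), 0
    for a in range(len(uni)):
        if uni[a] not in seen:
            universe += 1
            seen.add(uni[a])
            for b in range(a + 1, len(uni)):
                if fuzz.ratio(uni[a], uni[b]) >= 75:
                    seen.add(uni[b])
    return universe  # 3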
def Number_of_Comments():
    # Comment count for every gallery, in country order.
    Galeries_Matrix = np.array(galeries).reshape(len(Countries), 10)
    NbComments = []
    i = 0
    for Country in Countries:
        for j in range(10):
            Comments, Data = Load_GalLery_Textual_Data(Country, Galeries_Matrix[i, j])
            NbComments.append(len(Comments))
        i += 1
    return NbComments
def Sentiments_Analysis():
    # Mean sentiment and comment count for every gallery.
    Galeries_Matrix = np.array(galeries).reshape(len(Countries), 10)
    Sentiments = []
    NbComments = []
    i = 0
    for Country in Countries:
        print(str(i + 1) + ' : ' + Country)
        for j in range(10):
            Comments, Data = Load_GalLery_Textual_Data(Country, Galeries_Matrix[i, j])
            Sentiments.append(round(mean([float(s) for s in Senti_List(Comments)]), 2))
            NbComments.append(len(Comments))
        i += 1
    return Sentiments, NbComments
def Luxury_vs_users():
    # Comment counts for galleries whose Vision labels hit a luxury keyword.
    Galeries_Matrix = np.array(galeries).reshape(len(Countries), 10)
    NbComments = []
    i = 0
    for Country in Countries:
        print(str(i + 1) + ' : ' + Country)
        for j in range(10):
            Labels, jData = Load_Google_Labels(Country, Galeries_Matrix[i, j])
            for label in Labels:
                if label in Luxurykeys:
                    Comments, Data = Load_GalLery_Textual_Data(Country, Galeries_Matrix[i, j])
                    NbComments.append(len(Comments))
                    break  # count each gallery once, even with several luxury labels
        i += 1
    return NbComments
def Statistique():
    # Per-country corpus statistics: comment counts plus word, emoji, URL,
    # mention, and symbol counters for every comment.
    Galeries_Matrix = np.array(galeries).reshape(len(Countries), 10)
    Countries_Comments = {}
    Comments_word_Nb = {}
    # Comments_char_Nb = {}
    Countries_emogi = {}
    Countries_URLS = {}
    Countries_Mentions = {}
    Countries_Symbols = {}
    i = 0
    for Country in Countries:
        NB_Comments = []
        NB_W_Comments = []
        NB_emogi = []
        NB_URLS = []
        NB_Mentions = []
        NB_Symbols = []
        print(str(i + 1) + ' : ' + Country)
        for j in range(10):
            Comments, Data = Load_GalLery_Textual_Data(Country, Galeries_Matrix[i, j])
            NB_Comments.append(len(Comments))
            for Comment in Comments:
                emoji_counter, words_counter, urls_counter, Mentions_counter, Symbols_counter = split_count(Comment)
                NB_W_Comments.append(words_counter)
                NB_emogi.append(emoji_counter)
                NB_URLS.append(urls_counter)
                NB_Mentions.append(Mentions_counter)
                NB_Symbols.append(Symbols_counter)
        Comments_word_Nb[Country] = NB_W_Comments
        Countries_Comments[Country] = NB_Comments
        Countries_emogi[Country] = NB_emogi
        Countries_URLS[Country] = NB_URLS
        Countries_Mentions[Country] = NB_Mentions
        Countries_Symbols[Country] = NB_Symbols
        i += 1
    return Countries_Comments, Comments_word_Nb, Countries_emogi, Countries_URLS, Countries_Mentions, Countries_Symbols
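# split_count() is defined elsewhere in this project; a rough, hypothetical
# stand-in showing the five counters it is assumed to return, with
# illustrative regexes (not the project's actual patterns):
def _split_count_sketch(comment):
    import re
    words = len(comment.split())
    urls = len(re.findall(r'https?://\S+', comment))
    mentions = len(re.findall(r'@\w+', comment))
    symbols = len(re.findall(r'#\w+', comment))  # hashtags, assumed
    emojis = len(re.findall(r'[\U0001F300-\U0001FAFF\u2600-\u27BF]', comment))
    return emojis, words, urls, mentions, symbols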
def NaturePics_Vs_Comments():
    # Comment counts keyed by gallery id, split by nature vs non-nature.
    Galeries_Matrix = np.array(galeries).reshape(len(Countries), 10)
    NatureList = [item for sublist in Nature for item in sublist]
    NaturePics = {}
    NoNaturePics = {}
    i = 0
    for Country in Countries:
        print(str(i + 1) + ' : ' + Country)
        for j in range(10):
            Comments, Data = Load_GalLery_Textual_Data(Country, Galeries_Matrix[i, j])
            if Galeries_Matrix[i, j] in NatureList:
                NaturePics[Galeries_Matrix[i, j]] = len(Comments)
            else:
                NoNaturePics[Galeries_Matrix[i, j]] = len(Comments)
        i += 1
    return NaturePics, NoNaturePics
def LoadTextData(Country, gallery_id):
    # Top LDA topic words from the comments, alongside the Vision labels.
    S, Data = Load_GalLery_Textual_Data(Country, gallery_id)
    S1, Data1 = Load_GoogleVision_Labels(Country, gallery_id)
    labels = [Preprocessing(x['label']) for x in S1[0]]
    labels.append(Preprocessing(S1[1]))
    DocList = S[1]
    DocList.append(S[0])
    for s in S[2]:
        DocList.extend(s)
    data_lemmatized = PrepareData(DocList)
    lda_model, id2word, corpus = LDA(data_lemmatized, num_topics=20)  # alternative: num_topics=len(labels)
    Topic_Words = Topics_Words(lda_model, num_words=len(labels))
    return Topic_Words, labels
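# LDA() and Topics_Words() are project helpers defined elsewhere; a minimal
# gensim-based sketch of what they are assumed to do (hypothetical names,
# gensim < 4.0 API as with keywords() above):
def _LDA_sketch(data_lemmatized, num_topics=20):
    import gensim
    import gensim.corpora as corpora
    id2word = corpora.Dictionary(data_lemmatized)
    corpus = [id2word.doc2bow(doc) for doc in data_lemmatized]
    lda_model = gensim.models.LdaModel(corpus=corpus, id2word=id2word,
                                       num_topics=num_topics)
    return lda_model, id2word, corpus


def _Topics_Words_sketch(lda_model, num_words=10):
    # One flat list of the top words across all topics.
    return [w for _, topic in lda_model.show_topics(num_topics=-1,
                                                    num_words=num_words,
                                                    formatted=False)
            for w, _ in topic]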
def OverAll_Text_Similarity_DataSet():
    # Dump the three keyWords_Labels_Matching() similarity scores to one
    # JSON file per country. (Header reconstructed from the commented call
    # below; the loop mirrors the country/gallery pattern used throughout.)
    Galeries_Matrix = np.array(galeries).reshape(len(Countries), 10)
    i = 0
    for Country in Countries:
        Similarities = {}
        Slabels = []
        Scomments = []
        Soverall = []
        for j in range(10):
            # keyWords_Labels_Matching() returns five values; the extra
            # setA/setB are unused here.
            labels, comments, overall, setA, setB = keyWords_Labels_Matching(Country, Galeries_Matrix[i, j])
            Slabels.append(labels)
            Scomments.append(comments)
            Soverall.append(overall)
        Similarities['labels'] = Slabels
        Similarities['comments'] = Scomments
        Similarities['overall'] = Soverall
        with open('LDA Similarities/' + Country + '.json', 'w') as outfile:
            json.dump(Similarities, outfile)
        # break
        i += 1


def Histogramme(Country):
    # Bar chart of the per-gallery label-overlap scores for one country.
    with open('LDA Similarities/' + Country + '.json') as data_file:
        Data = json.load(data_file)
    # plt.hist(Data['overall'])
    x = np.arange(10)
    plt.bar(x, Data['labels'])
    plt.xticks(x + .2, x)


# OverAll_Text_Similarity_DataSet()
# Histogramme('Algeria')
# labels, comments, overall, setA, setB = keyWords_Labels_Matching('Algeria', 'x6TwpSQ')
S, Data = Load_GalLery_Textual_Data('Algeria', 'x6TwpSQ')
# S1, Data1 = Load_GoogleVision_Labels('Algeria', 'x6TwpSQ')