def find_duplicate_comments(main_list, val):
    # Return the highest cosine similarity between the comment at index `val`
    # and every other comment posted by the same channel.
    sub_list = main_list[val]
    channel_id = sub_list[4]
    comment = sub_list[3]
    maxv = 0
    ct = 0
    for rowd in main_list:
        if val == ct:  # skip comparing the comment with itself
            ct += 1
            continue
        else:
            ct += 1
        if channel_id == rowd[4]:  # same channel: candidate duplicate
            row_comment = rowd[3]
            dataset = [comment, row_comment]
            processed_text = []
            for text in dataset:
                # list of token lists; duplicate words are kept deliberately
                processed_text.append(word_tokenize(str(preprocess(text))))
            vectors_list = vectorize(processed_text)
            vec1 = vectors_list[0]
            vec2 = vectors_list[1]
            dot = sum(a * b for a, b in zip(vec1, vec2))
            norm_a = sum(a * a for a in vec1) ** 0.5
            norm_b = sum(b * b for b in vec2) ** 0.5
            cos_sims = dot / (norm_a * norm_b)
            if maxv <= cos_sims:
                maxv = cos_sims
    return maxv
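# find_duplicate_comments() relies on helpers defined elsewhere in the project
# (preprocess, word_tokenize, vectorize). As a rough illustration of the
# contract vectorize() is assumed to satisfy here, the minimal sketch below
# builds plain count vectors over the union vocabulary of the token lists.
# The name vectorize_sketch and its behaviour are assumptions for
# illustration, not the project's actual implementation.
def vectorize_sketch(token_lists):
    vocab = sorted({tok for tokens in token_lists for tok in tokens})
    index = {tok: i for i, tok in enumerate(vocab)}
    vectors = []
    for tokens in token_lists:
        vec = [0] * len(vocab)
        for tok in tokens:
            vec[index[tok]] += 1  # count occurrences, duplicates included
        vectors.append(vec)
    return vectors

# e.g. vectorize_sketch([['a', 'b', 'a'], ['b', 'c']]) -> [[2, 1, 0], [0, 1, 1]]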
def mainScript():
    wb1 = openpyxl.load_workbook('contentData.xlsx')
    wb2 = openpyxl.load_workbook('commentData.xlsx')
    worksheet1 = wb1.active
    worksheet2 = wb2.active
    number_of_round = worksheet1.max_row + 1
    fields_list = []
    for x in range(2, number_of_round):
        vid = worksheet1.cell(row=x, column=2).value
        main_list = []
        post_date = ""
        for row in worksheet2.iter_rows():
            if row[0].value == vid:
                row_list = []
                count = 1
                for cell in row:
                    row_list.append(worksheet2.cell(row=cell.row, column=count).value)
                    post_date = worksheet2.cell(row=cell.row, column=2).value
                    count += 1
                main_list.append(row_list)
        # example row:
        # ['BHhl75a5bks', '2021-01-29T14:34:46Z', 'Ugw5ej-XdiFL40uu9814AaABAg',
        #  'හැමදාම unea ඔහොම තමා', 'UCh3O7jnH1dspTmxbkW4K6yA',
        #  'ශ්\u200dරී ලංකා ක්\u200dරිකට් තවත් කැපිල්ලක්..', 0,
        #  '2021-01-30T16:12:28Z', '2021-01-30T16:12:28Z']
        dataset = []
        comments_date = []
        comment_id = []
        for li in main_list:
            dataset.append(li[3])
            comments_date.append(li[8])
            comment_id.append(li[2])
        content_string = worksheet1.cell(row=x, column=1).value
        dataset.insert(0, content_string)
        N = len(dataset)

        def calculate_time_gap(val):
            # normalise the trailing 'Z' so fromisoformat() can parse it
            rep1 = post_date.replace("Z", "+00:00")
            rep2 = val.replace("Z", "+00:00")
            x = datetime.fromisoformat(rep1).timestamp()
            y = datetime.fromisoformat(rep2).timestamp()
            dif = (y - x) / 60  # gap in minutes
            return dif

        def generate_ngrams(text):
            text = re.sub(r'[#$%&()*+-./:;<=>?\s]', '', text)
            tokens = word_tokenize(text)  # e.g. ['මචං', 'මැසේජ්', 'වගේ']
            list_comment = []
            for token in tokens:  # take one word at a time
                n = len(token)
                list_word = []
                for i in range(0, n - 1):
                    bi = token[i:i + 2]  # overlapping character bi-gram
                    list_word.append(bi)
                list_comment.extend(list_word)
            return list_comment

        bi_gram_processed_text = []  # list of bi-gram lists per comment (duplicates kept)
        words_processed_text = []    # list of word-token lists per comment (duplicates kept)
        for text in dataset[:N]:
            # remove stop words and punctuation before building bi-grams
            text_preprocessed_bi_gram = preprocess(text)
            bi_grams = generate_ngrams(text_preprocessed_bi_gram)  # character bi-grams for one comment
            bi_gram_processed_text.append(bi_grams)
            words_processed_text.append(word_tokenize(str(remove_punctuation(text))))
        vectors_list = vectorize(bi_gram_processed_text)

        # centre vector = mean of all comment vectors (index 0 is the content vector)
        center_vector = np.zeros(len(vectors_list[0]))
        for numb in range(1, len(vectors_list)):
            center_vector = center_vector + vectors_list[numb]
        new_center_vector = center_vector / (len(vectors_list) - 1)
        content_vector = vectors_list[0]

        for i in range(1, N):
            cos_simillerity_content_comment = calculate_content_comment_similerity(content_vector, vectors_list[i].tolist())
            cos_simillerity_comment_comment = calculate_content_comment_similerity(new_center_vector, vectors_list[i].tolist())
            word_count = len(words_processed_text[i])
            duplicate_word_ratio = duplicate_words(words_processed_text[i])
            no_of_sentences = no_sentences(dataset[i])
            length_of_comment = com_length(dataset[i])
            num_of_punctuations = no_punctuations(dataset[i])
            stop_word_ratio = count_stop_word(words_processed_text[i])
            post_coment_gap = calculate_time_gap(comments_date[i - 1])
            is_black_word = check_black_words_list(words_processed_text[i])
            link_mail_pnumber = find_mail_site_pnumber(dataset[i])
            comment_duplication = find_duplicate_comments(main_list, i - 1)
            comment_id_val = comment_id[i - 1]
            stringList = makes_csv(comment_id_val, cos_simillerity_content_comment,
                                   cos_simillerity_comment_comment, word_count,
                                   duplicate_word_ratio, no_of_sentences,
                                   length_of_comment, num_of_punctuations,
                                   stop_word_ratio, post_coment_gap, is_black_word,
                                   link_mail_pnumber, comment_duplication)
            fields_list.append(stringList)
    create_file(fields_list)

#mainScript()
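# generate_ngrams() above splits every token into overlapping character
# bi-grams. The Sinhala tokens in the comments behave the same way as this
# self-contained ASCII sketch (word_tokenize is replaced by str.split here
# purely so the example runs without NLTK; the function name is illustrative):
import re

def char_bigrams(text):
    text = re.sub(r'[#$%&()*+-./:;<=>?]', '', text)  # strip punctuation
    grams = []
    for token in text.split():
        grams.extend(token[i:i + 2] for i in range(len(token) - 1))
    return grams

# char_bigrams("spam link") -> ['sp', 'pa', 'am', 'li', 'in', 'nk']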
def mainScript():
    wb1 = openpyxl.load_workbook('contents.xlsx')
    wb2 = openpyxl.load_workbook('comments.xlsx')
    df = pd.read_excel('comments.xlsx')
    li_id = df['comment_id'].tolist()
    worksheet1 = wb1.active
    worksheet2 = wb2.active
    number_of_round = worksheet1.max_row + 1
    fields_list = []
    for x in range(2, number_of_round):
        vid = worksheet1.cell(row=x, column=2).value
        main_list = []
        post_date = ""
        for row in worksheet2.iter_rows():
            if row[0].value == vid:
                row_list = []
                count = 1
                for cell in row:
                    row_list.append(worksheet2.cell(row=cell.row, column=count).value)
                    post_date = worksheet2.cell(row=cell.row, column=2).value
                    count += 1
                main_list.append(row_list)
        dataset = []
        comments_date = []
        for li in main_list:
            dataset.append(li[3])
            comments_date.append(li[8])
        content_string = worksheet1.cell(row=x, column=1).value
        dataset.insert(0, content_string)
        N = len(dataset)

        def calculate_time_gap(val):
            obj1 = datetime.strptime(post_date, '%Y-%m-%dT%H:%M:%SZ')
            obj2 = datetime.strptime(val, '%Y-%m-%dT%H:%M:%SZ')
            time_delta = obj2 - obj1
            total_seconds = time_delta.total_seconds()
            return total_seconds

        def generate_ngrams(text):
            text = re.sub(r'[#$%&()*+-./:;<=>?\s]', '', text)
            tokens = word_tokenize(text)  # e.g. ['මචං', 'මැසේජ්', 'වගේ']
            list_comment = []
            for token in tokens:  # take one word at a time
                n = len(token)
                list_word = []
                for i in range(0, n - 1):
                    bi = token[i:i + 2]
                    list_word.append(bi)
                list_comment.extend(list_word)
            return list_comment

        # detect a sequence of two consecutive periods ("..") in the comment
        def calc(st):
            x = ''
            y = ''
            for i in st:
                y = x
                x = i
                if x == '.' and y == '.':
                    return 1
            return 0

        bi_gram_processed_text = []  # list of bi-gram lists per comment (duplicates kept)
        words_processed_text = []    # list of word-token lists per comment (duplicates kept)
        for text in dataset[:N]:
            # remove stop words and punctuation before building bi-grams
            text_preprocessed_bi_gram = preprocess(text)
            bi_grams = generate_ngrams(text_preprocessed_bi_gram)
            bi_gram_processed_text.append(bi_grams)
            words_processed_text.append(word_tokenize(str(remove_punctuation(text))))
        vectors_list = vectorize(bi_gram_processed_text)

        # leave-one-out centre vectors: for each comment, average every *other*
        # comment vector (index 0 is the content vector, so it is excluded)
        dynamic_center_vector = []
        for numb in range(1, len(vectors_list)):
            center_vector = np.zeros(len(vectors_list[0]))
            count = 0
            for i in vectors_list[1:]:
                count += 1
                if count == numb:
                    continue
                else:
                    center_vector = np.add(center_vector, i)
            row_center_vector = center_vector / (len(vectors_list) - 2)
            dynamic_center_vector.append(row_center_vector)
        content_vector = vectors_list[0]

        for i in range(1, N):
            cos_simillerity_content_comment = calculate_content_comment_similerity(content_vector, vectors_list[i].tolist())
            cos_simillerity_comment_comment = calculate_content_comment_similerity(dynamic_center_vector[i - 1], vectors_list[i].tolist())
            duplicate_word_ratio = duplicate_words(words_processed_text[i])
            no_of_sentences = no_sentences(dataset[i])
            num_of_punctuations = no_punctuations(dataset[i])
            is_period_sequence = calc(dataset[i])
            stop_word_ratio = count_stop_word(words_processed_text[i])
            post_coment_gap = calculate_time_gap(comments_date[i - 1])
            comment_duplication = find_duplicate_comments(main_list, i - 1)
            link_mob_mail_word_length = link_mob_mail_length_blckword(words_processed_text[i])

            print(' ')
            print(dataset[i])
            print("cos_simillerity_content_comment ", cos_simillerity_content_comment)
            print("cos_simillerity_comment_comment ", cos_simillerity_comment_comment)
            print("word_count ", link_mob_mail_word_length[1])
            print("duplicate_word_ratio ", duplicate_word_ratio)
            print("no_of_sentences ", no_of_sentences)
            print("length_of_comment ", link_mob_mail_word_length[0])
            print("num_of_punctuations ", num_of_punctuations)
            print("is period sequence ", is_period_sequence)
            print("stop_word_ratio ", stop_word_ratio)
            print("post_coment_gap ", post_coment_gap)
            print("black_word_ratio ", link_mob_mail_word_length[-1])
            print("is_link ", link_mob_mail_word_length[2])
            print("is_youtube_link ", link_mob_mail_word_length[3])
            print("is_number ", link_mob_mail_word_length[4])
            print("comment_duplication ", comment_duplication)

            stringList = makes_csv(li_id[i - 1], cos_simillerity_content_comment,
                                   cos_simillerity_comment_comment,
                                   link_mob_mail_word_length[1], duplicate_word_ratio,
                                   no_of_sentences, link_mob_mail_word_length[0],
                                   num_of_punctuations, is_period_sequence,
                                   stop_word_ratio, post_coment_gap,
                                   link_mob_mail_word_length[-1],
                                   link_mob_mail_word_length[2],
                                   link_mob_mail_word_length[3],
                                   link_mob_mail_word_length[4], comment_duplication)
            fields_list.append(stringList)
    create_file(fields_list)
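# The dynamic_center_vector loop above computes, for each comment, the mean of
# every *other* comment vector (a leave-one-out centroid), so a comment is
# never compared against a centre that already contains itself. With numpy
# the same result follows from subtracting each vector from the total (the
# array below is a toy example, not project data):
import numpy as np

comment_vecs = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
total = comment_vecs.sum(axis=0)
loo_centroids = (total - comment_vecs) / (len(comment_vecs) - 1)
# loo_centroids[0] == mean of vectors 1 and 2 == [0.5, 1.0], and so on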
def mainScript():
    wb1 = openpyxl.load_workbook('contents.xlsx')
    wb2 = openpyxl.load_workbook('comments.xlsx')
    df = pd.read_excel('comments.xlsx')
    li_id = df['comment_id'].tolist()
    worksheet1 = wb1.active
    worksheet2 = wb2.active
    number_of_round = worksheet1.max_row + 1
    fields_list = []
    for x in range(2, number_of_round):
        vid = worksheet1.cell(row=x, column=2).value
        main_list = []
        post_date = ""
        for row in worksheet2.iter_rows():
            if row[0].value == vid:
                row_list = []
                count = 1
                for cell in row:
                    row_list.append(worksheet2.cell(row=cell.row, column=count).value)
                    post_date = worksheet2.cell(row=cell.row, column=2).value
                    count += 1
                main_list.append(row_list)
        dataset = []
        comments_date = []
        for li in main_list:
            dataset.append(li[3])
            comments_date.append(li[8])
        content_string = worksheet1.cell(row=x, column=1).value
        dataset.insert(0, content_string)
        N = len(dataset)

        def de_emojies(text1):
            # replace each emoji run with the Sinhala word ' ඉමොජි ' ("emoji")
            regrex_pattern = re.compile(pattern="["
                                        u"\U0001F600-\U0001F64F"  # emoticons
                                        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                        u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                        u"\U00002702-\U000027B0"
                                        u"\U000024C2-\U0001F251"
                                        "]+", flags=re.UNICODE)
            text = regrex_pattern.sub(r' ඉමොජි ', text1)
            return text

        new_dataset = []

        def remove_emojies(dataset):
            # populate new_dataset with emoji-free copies of every text
            d_list = dataset
            for one_list in d_list:
                clean_text = de_emojies(one_list)
                new_dataset.append(clean_text)

        remove_emojies(dataset)

        def calculate_time_gap(val):
            obj1 = datetime.strptime(post_date, '%Y-%m-%dT%H:%M:%SZ')
            obj2 = datetime.strptime(val, '%Y-%m-%dT%H:%M:%SZ')
            time_delta = obj2 - obj1
            total_seconds = time_delta.total_seconds()
            return total_seconds

        def generate_ngrams(text):
            text = re.sub(r'[#$%&()*+-./:;<=>?\s]', '', text)
            tokens = word_tokenize(text)  # e.g. ['මචං', 'මැසේජ්', 'වගේ']
            list_comment = []
            for token in tokens:  # take one word at a time
                n = len(token)
                list_word = []
                for i in range(0, n - 1):
                    bi = token[i:i + 2]
                    list_word.append(bi)
                list_comment.extend(list_word)
            return list_comment

        # detect a sequence of two consecutive periods ("..") in the comment
        def calc(st):
            x = ''
            y = ''
            for i in st:
                y = x
                x = i
                if x == '.' and y == '.':
                    return 1
            return 0

        bi_gram_processed_text = []  # list of bi-gram lists per comment (duplicates kept)
        words_processed_text = []    # list of word-token lists per comment (duplicates kept)
        for text in new_dataset[:N]:
            # remove stop words and punctuation before building bi-grams
            text_preprocessed_bi_gram = preprocess(text)
            bi_grams = generate_ngrams(text_preprocessed_bi_gram)
            bi_gram_processed_text.append(bi_grams)
            words_processed_text.append(word_tokenize(str(remove_punctuation(text))))
        vectors_list = vectorize(bi_gram_processed_text)

        # leave-one-out centre vectors, as in the previous version
        dynamic_center_vector = []
        for numb in range(1, len(vectors_list)):
            center_vector = np.zeros(len(vectors_list[0]))
            count = 0
            for i in vectors_list[1:]:
                count += 1
                if count == numb:
                    continue
                else:
                    center_vector = np.add(center_vector, i)
            row_center_vector = center_vector / (len(vectors_list) - 2)
            dynamic_center_vector.append(row_center_vector)
        content_vector = vectors_list[0]

        for i in range(1, N):
            cos_simillerity_content_comment = calculate_content_comment_similerity(content_vector, vectors_list[i].tolist())
            cos_simillerity_comment_comment = calculate_content_comment_similerity(dynamic_center_vector[i - 1], vectors_list[i].tolist())
            duplicate_word_ratio = duplicate_words(words_processed_text[i])
            no_of_sentences = no_sentences(new_dataset[i])
            num_of_punctuations = no_punctuations(new_dataset[i])
            is_period_sequence = calc(new_dataset[i])
            post_coment_gap = calculate_time_gap(comments_date[i - 1])
            comment_duplication = find_duplicate_comments(main_list, i - 1)
            link_mob_mail_word_length = link_mob_mail_length_blckword(words_processed_text[i])
            stringList = makes_csv(li_id[i - 1], cos_simillerity_content_comment,
                                   cos_simillerity_comment_comment,
                                   link_mob_mail_word_length[1], duplicate_word_ratio,
                                   no_of_sentences, link_mob_mail_word_length[0],
                                   num_of_punctuations, is_period_sequence,
                                   post_coment_gap, link_mob_mail_word_length[-1],
                                   link_mob_mail_word_length[2],
                                   link_mob_mail_word_length[3],
                                   link_mob_mail_word_length[4], comment_duplication)
            fields_list.append(stringList)
    create_file(fields_list)

#mainScript()
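# de_emojies() substitutes the Sinhala word ' ඉමොජි ' ("emoji") for each emoji
# run so that emoji usage survives tokenization as an ordinary word. Expected
# behaviour on a toy string (the sample text is illustrative, not project data):
#
#   sample = "සුපිරි 😂😂 video"
#   de_emojies(sample) -> 'සුපිරි  ඉමොජි  video'
#
# Note the two emojis collapse into a single placeholder because the pattern
# ends in "]+" and therefore matches the whole run as one occurrence.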