# Assumes: import pandas as pd; from gensim.utils import tokenize;
# from gensim.parsing.preprocessing import remove_stopwords
def __init__(self, lyrics_file, artist_name):
    """Read a previously downloaded lyrics JSON file and process it into a dataframe."""
    df = pd.read_json(lyrics_file)
    df.rename(columns={"name": "album"}, inplace=True)
    df["year"] = pd.to_numeric(df["year"], downcast="integer")
    df = df.dropna()
    # Bucket each song into its decade (e.g. 1994 -> 1990).
    df["decade"] = df["year"] - df["year"] % 10
    df["decade"] = df["decade"].astype("int")
    df.drop("URL", axis=1, inplace=True)
    df.sort_values(["year", "song"], inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.drop_duplicates(subset="lyrics", keep="first", inplace=True)
    # Strip the "<Artist> - " prefix and " Lyrics" suffix from song titles.
    df["song"] = df["song"].str.replace(
        r"(^{0} - )|( Lyrics$)".format(artist_name.title()), "", regex=True)
    # Expand common English contractions so tokens match their full forms.
    df["lyrics"] = df["lyrics"].str.replace("won't", "will not")
    df["lyrics"] = df["lyrics"].str.replace("can't", "can not")
    df["lyrics"] = df["lyrics"].str.replace("n't", " not")
    df["lyrics"] = df["lyrics"].str.replace("'m", " am")
    df["lyrics"] = df["lyrics"].str.replace("'re", " are")
    df["lyrics"] = df["lyrics"].str.replace("'ll", " will")
    df["lyrics"] = df["lyrics"].str.replace("'s", " is")
    df["lyrics"] = df["lyrics"].str.replace("'ve", " have")
    # Collapse consecutive repeated words ("la la la" -> "la ").
    df["lyrics"] = df["lyrics"].str.replace(r"(\w+\s)\1+", r"\1", regex=True)
    df["lyrics"] = df["lyrics"].str.replace(r"(\s\w+)\1+", r"\1", regex=True)
    self.data = df
    # Per-song token statistics, computed after stop-word removal.
    self.data["word_count"] = self.data.lyrics.apply(
        lambda x: len(list(tokenize(remove_stopwords(x), lower=True))))
    self.data["unique_words"] = self.data.lyrics.apply(
        lambda x: len(set(tokenize(remove_stopwords(x), lower=True))))
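# Usage sketch: the class name LyricsData and the file "lyrics.json" are
# hypothetical; the __init__ above belongs to some such lyrics-processing class.
songs = LyricsData("lyrics.json", "radiohead")
print(songs.data[["album", "year", "decade", "song", "word_count", "unique_words"]].head())
# Vocabulary richness per song, using the two columns built above:
print((songs.data["unique_words"] / songs.data["word_count"]).describe())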
# Assumes: import os; from collections import defaultdict;
# from gensim import corpora, similarities; from gensim.models import LsiModel;
# from gensim.parsing.preprocessing import remove_stopwords; STOP_LIST is a
# module-level collection of stop words.
def prepare_index(doc_path):
    """Persist the dictionary, corpus, and index to disk
    so they can be reused later on."""
    with open(doc_path) as input_f:
        file_name, _ = os.path.splitext(doc_path)
        raw_syllabus = input_f.read().replace("\n", "")
    documents = [remove_stopwords(raw_syllabus)]
    texts = [[word for word in document.lower().split() if word not in STOP_LIST]
             for document in documents]
    # Keep only tokens that occur more than once.
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 1] for text in texts]
    dictionary = corpora.Dictionary(texts)
    dictionary.save("{}.dict".format(file_name))
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize("{}.mm".format(file_name), corpus)
    lsi = LsiModel(corpus, id2word=dictionary, num_topics=1)
    index = similarities.MatrixSimilarity(lsi[corpus])
    index.save("{}.index".format(file_name))
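# Query-time sketch, assuming prepare_index() was run on "syllabus.txt". Note
# that prepare_index persists only the dictionary, corpus, and index; the LSI
# model itself would also need saving (lsi.save("syllabus.lsi") is hypothetical).
from gensim import corpora, similarities
from gensim.models import LsiModel
from gensim.parsing.preprocessing import remove_stopwords

dictionary = corpora.Dictionary.load("syllabus.dict")
index = similarities.MatrixSimilarity.load("syllabus.index")
lsi = LsiModel.load("syllabus.lsi")  # hypothetical: not written by prepare_index

query = "information retrieval basics"
query_bow = dictionary.doc2bow(remove_stopwords(query).lower().split())
sims = index[lsi[query_bow]]  # cosine similarity against each indexed document
print(list(enumerate(sims)))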
# Assumes: import re; from gensim.parsing.preprocessing import remove_stopwords
def clean_data(text):
    text = re.sub(r'@[\w]*', '', text)         # remove @user mentions
    text = re.sub('&', '', text)               # remove ampersands
    text = re.sub('[?!.;:,#@-]', '', text)     # remove special characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # remove non-ASCII characters
    text = re.sub('[^A-Za-z]', ' ', text)      # replace everything except letters with a space
    text = text.lower()                        # lowercase for uniformity
    # Remove stop-words, e.g. 'we', 'our', 'ours', 'ourselves', 'just', 'don',
    # "don't", 'should'.
    text = remove_stopwords(text)
    return text
# Assumes: import re; from gensim.parsing.preprocessing import remove_stopwords
def clean_data(text):
    text = re.sub(r'@[\w]*', '', text)         # remove @user mentions
    text = re.sub('&', '', text)               # remove ampersands
    text = re.sub('[?!.;:,@-]', '', text)      # remove special characters (keep '#' for hashtags)
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # remove non-ASCII characters
    text = re.sub('[^A-Za-z#]', ' ', text)     # replace everything except letters and '#' with a space
    text = text.lower()                        # lowercase for uniformity
    # Remove short words of length 3 or lower (e.g. "hmm", "oh"),
    # since they rarely add value.
    text = " ".join(w for w in text.split() if len(w) > 3)
    # Remove stop-words, e.g. 'we', 'our', 'ours', 'ourselves', 'just', 'should'.
    text = remove_stopwords(text)
    return text
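# Quick check of the second clean_data variant (the sample tweet is made up):
sample = "@user Loving the new phone!! #excited but the battery... hmm"
print(clean_data(sample))
# -> roughly "loving phone #excited battery", once mentions, punctuation,
#    short words, and stop-words have been stripped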
# Assumes: stemmer is a module-level PorterStemmer and collection_freq_dict a
# module-level dict; remove_stopwords from gensim.parsing.preprocessing.
def compute_collection_frequency(query_dict, doc_dict):
    """Count every occurrence of each stemmed query term across the whole collection."""
    for qno, query in query_dict.items():
        for qterm in remove_stopwords(query).split():
            term_count = 0
            qterm_stem = stemmer.stem(qterm.lower().strip())
            for docid, doc in doc_dict.items():
                for term in doc.split():
                    if term == qterm_stem:
                        term_count += 1
            collection_freq_dict[qterm_stem] = term_count
    return collection_freq_dict
# Assumes: import numpy as np; from nltk.stem import PorterStemmer;
# query_similarity is a module-level dict keyed by query id.
def jaccard_similarity(initial_dict, var_dict):
    """Jaccard similarity between each initial query and each of its variants."""
    stemmer = PorterStemmer()
    for qid in initial_dict:
        similarity_list = []
        initial = remove_stopwords(initial_dict[qid])
        # Stem each token individually (PorterStemmer.stem expects a single word).
        initial_set = {stemmer.stem(w) for w in initial.lower().strip().split()}
        variant_list = var_dict[qid]
        for var in variant_list:
            variant = remove_stopwords(var)
            variant_set = {stemmer.stem(w) for w in variant.lower().strip().split()}
            intersec = initial_set.intersection(variant_set)
            # |A ∩ B| / |A ∪ B|
            similarity = round(
                float(len(intersec)) /
                (len(initial_set) + len(variant_set) - len(intersec)), 4)
            similarity_list.append(similarity)
        query_similarity[qid] = np.array(similarity_list).astype(float)
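# Toy run (made-up queries; uses the module-level query_similarity dict assumed above):
query_similarity = {}
initial = {"q1": "effects of climate change"}
variants = {"q1": ["climate change effects", "impact of global warming"]}
jaccard_similarity(initial, variants)
print(query_similarity["q1"])  # first variant shares every stem -> 1.0; second shares none -> 0.0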
# Assumes the same module-level stemmer, plus a module-level document_freq_dict.
def compute_document_frequency(query_dict, doc_dict):
    """Count, for each stemmed query term, the number of documents containing it."""
    for qno, query in query_dict.items():
        for qterm in remove_stopwords(query).split():
            qterm_stem = stemmer.stem(qterm.lower().strip())
            if qterm_stem in document_freq_dict:
                continue  # already counted for an earlier query
            document_freq_dict[qterm_stem] = 0
            for docid, doc in doc_dict.items():
                # Count each document at most once per term.
                if qterm_stem in doc.split():
                    document_freq_dict[qterm_stem] += 1
            print('doc freq : ', qterm_stem, '\t', document_freq_dict[qterm_stem])
    return document_freq_dict
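# Toy demo of the two frequency helpers (made-up data; the documents are assumed
# to be pre-stemmed, matching the direct term comparison in both functions):
stemmer = PorterStemmer()
collection_freq_dict, document_freq_dict = {}, {}
queries = {"q1": "apple harvest"}
docs = {"d1": "appl appl pie", "d2": "appl harvest", "d3": "pear tart"}
print(compute_collection_frequency(queries, docs))  # {'appl': 3, 'harvest': 1}
print(compute_document_frequency(queries, docs))    # {'appl': 2, 'harvest': 1}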
# Assumes: import csv, re; from gensim.parsing.preprocessing import
# remove_stopwords; stemSentence is a helper defined elsewhere that stems
# every word of a sentence.
def writeFile(anyList, path):
    with open(path, 'w', newline='') as out_f:
        w = csv.writer(out_f)
        for key, value in anyList.items():
            w.writerow([key, value])

slist, hlist = [], []
with open('sms-spam-corpus.csv', newline='') as f:
    reader = csv.DictReader(f, delimiter=',')
    for row in reader:
        # Keep letters only, lowercase, drop stop-words, then stem.
        cleaned = stemSentence(remove_stopwords(
            re.sub(r'[^A-Za-z]+', r' ', row['v2']).lower()))
        if row['v1'] == 'spam':
            slist.append(cleaned)
        else:
            hlist.append(cleaned)

# Class priors for a Naive Bayes spam filter.
amountOfHamSentence = len(hlist)
amountOfSpamSentence = len(slist)
all_sentence = amountOfHamSentence + amountOfSpamSentence
P_Ham = amountOfHamSentence / all_sentence
P_Spam = amountOfSpamSentence / all_sentence
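# A possible next step (sketch only; word_likelihoods is a hypothetical helper):
# add-one-smoothed word likelihoods P(word | class) to pair with the P_Ham and
# P_Spam priors above, taking the per-class counts produced by word_count below.
def word_likelihoods(counts, vocab_size):
    total = sum(counts.values())
    # Laplace smoothing keeps unseen words from zeroing out the product.
    return {w: (c + 1) / (total + vocab_size) for w, c in counts.items()}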
# Assumed shape of word_count, whose body the fragment cut off before its
# final `return counts`: tally word frequencies across a list of sentences.
def word_count(sentences):
    counts = {}
    for sentence in sentences:
        for word in sentence.split():
            counts[word] = counts.get(word, 0) + 1
    return counts

def writeFile(anyList, path):
    with open(path, 'w', newline='') as out_f:
        w = csv.writer(out_f)
        for key, value in anyList.items():
            w.writerow([key, value])

slist, hlist = [], []
with open('sms-spam-corpus.csv', newline='') as f:
    reader = csv.DictReader(f, delimiter=',')
    for row in reader:
        cleaned = stemSentence(remove_stopwords(
            re.sub(r'[^A-Za-z]+', r' ', row['v2']).lower()))
        if row['v1'] == 'spam':
            slist.append(cleaned)
        else:
            hlist.append(cleaned)

# Per-class word frequencies, persisted as word,count CSV rows.
sDict = word_count(slist)
hDict = word_count(hlist)
writeFile(sDict, 'mycsvfile.csv')
writeFile(hDict, 'mycsvfile1.csv')
# print("VECTOR : ", trec_corpus) # else: # trec_corpus[parts[0]].append(vectors.vocab[w].index) else: oov += 1 trec_text_collection = TextCollection(trec_text_collection_data) print('all ', count, ' docs loaded') print('total ', oov, ' no. of words are not included') # load topics file trec_topics = {} # topic -> list of query term vector ids max_topic_word_count = 0 with open(arg_topics_file, 'r') as inputFile: for line in inputFile: line = remove_stopwords(line) parts = line.split(' ', 1) if parts[0] not in trec_topics: trec_topics[parts[0]] = [] for w in parts[1].split(' '): # w = w.strip() ws = stemmer.stem(w.lower().strip()) # for stemming query terms # print("QUERY : ", ws) # ws = w.strip() # if query terms should be unstemmed if ws in vectors.vocab: trec_topics[parts[0]].append(vectors.vocab[ws].index) else: print(ws, ' -- not in .vec')