def get(self):
    # standardize
    keywords = self.trimmed_stopwords(self.tokenize(self.theme, pos='noun_verbs'))
    # search about theme
    articles = self.search_articles([keyword.surface for keyword in keywords][:3])
    # clean
    docs = map(self.clean, articles)
    # divide sentences
    sentences_cand = map(self.divide, docs)
    sent = []
    for s in sentences_cand:
        sent.append(list(filter(self.is_sentence, s)))
    sentences = list(chain.from_iterable(sent))
    # tfidf format
    sentence_tokens = []
    for sentence in sentences:
        noun_tokens = [token.surface for token in self.tokenize(sentence, pos='noun')]
        sentence_tokens.append(' '.join(noun_tokens))
    # vectorize
    vector = TfIdf.vector(sentence_tokens)
    # clustering
    cluster = numpy.array(TfIdf.cluster(vector, clusters=3))
    # retrieve opinion with tf
    tfidf_score_index = numpy.argsort(numpy.array([sum(v) for v in vector.toarray()]))[::-1]
    opinions = []
    for i in range(3):
        # retrieve vector index by cluster
        c_index = numpy.where(cluster == i)
        for k in tfidf_score_index:
            if k in c_index[0]:
                opinions.append(sentences[k])
                break
    theme = namedtuple('Theme', 'keywords, opinions')
    return theme(' '.join([keyword.surface for keyword in keywords][:3]), opinions)
def __init__(self, corpus_filename=None, stopword_filename=None, DEFAULT_IDF=1.5):
    TfIdf.__init__(self, corpus_filename=corpus_filename,
                   stopword_filename=stopword_filename, DEFAULT_IDF=DEFAULT_IDF)
    self.init_file_count()
def main():
    # SETTINGS
    NUM_PAGES = 10000
    corpus_filename = "corpus10k.txt"
    stopwords_filename = "stopwords10k.txt"
    myTfIdf = TfIdf(corpus_filename, stopwords_filename)
    content = []
    worker_threads = []
    url = "http://en.wikipedia.org/wiki/Special:Random"
    for i in range(NUM_PAGES):
        t = threading.Thread(target=clean_html_thread, args=(url, content))
        t.start()
        worker_threads.append(t)
    for t in worker_threads:
        t.join()
    for t in worker_threads:
        if not t.isAlive():
            # get results from thread
            t.handled = True
    worker_threads = [t for t in worker_threads if not t.handled]
    for document in content:
        myTfIdf.add_input_document(document)
        print_keywords(document)
    myTfIdf.save_corpus_to_file(corpus_filename, stopwords_filename)
def createTFIDFTopics(self):
    self.db = sqlite3.connect(self.dbname, detect_types=sqlite3.PARSE_DECLTYPES)
    c = self.db.cursor()
    headlines = {}
    c.execute("SELECT article_day,country,title,url,article_hash FROM articles_headlines")
    for row in c.fetchall():
        title = row[2]
        # c.execute('SELECT content from articles where hash = ?', (row[4],))
        # content = c.fetchone()[0]
        lista = headlines.get(str(row[0]) + '-' + row[1])
        if lista is None:
            # headlines[str(row[0])+'-'+row[1]] = [title + ' ' + content]
            headlines[str(row[0]) + '-' + row[1]] = [title]
        else:
            # headlines[str(row[0])+'-'+row[1]].append(title + ' ' + content)
            headlines[str(row[0]) + '-' + row[1]].append(title)
    self.db.close()
    for hd, contents in headlines.iteritems():
        print '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ' + hd
        with open('stopwords.txt', 'r') as st:
            tfidf = TfIdf(stopwords=[x.strip() for x in st.readlines()])
            tfidf.parse(contents)
def main():
    tfidf = TfIdf(corpus_filename="moviecorpus.txt")
    # tfidf.add_document_to_corpus()
    # print tfidf.term_freq
    # print tfidf.num_words
    for line in tfidf.get_summary('oblivion.txt', 5):
        print line
def createTFIDFTopics(self):
    self.db = psycopg2.connect("dbname=%s user=%s password=%s host=%s" % (
        self.dbname, self.dbuser, self.dbpass, self.dbhost))
    c = self.db.cursor()
    headlines = {}
    c.execute("SELECT article_day,country,title,url,article_hash FROM articles_headlines")
    for row in c.fetchall():
        title = row[2]
        # c.execute('SELECT content from articles where hash = ?', (row[4],))
        # content = c.fetchone()[0]
        lista = headlines.get(str(row[0]) + '-' + row[1])
        if lista is None:
            # headlines[str(row[0])+'-'+row[1]] = [title + ' ' + content]
            headlines[str(row[0]) + '-' + row[1]] = [title]
        else:
            # headlines[str(row[0])+'-'+row[1]].append(title + ' ' + content)
            headlines[str(row[0]) + '-' + row[1]].append(title)
    self.db.close()
    for hd, contents in headlines.items():
        print(f'>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> {hd}')
        with open('stopwords.txt', 'r') as st:
            tfidf = TfIdf(stopwords=[x.strip() for x in st.readlines()])
            tfidf.parse(contents)
def __init__(self, sql_obj=None):
    if not sql_obj:
        self.sql = SQLQuery()
    else:
        self.sql = sql_obj
    self.tfidf_obj = TfIdf()
    self.ids = None
def calcularfrecuencia(self, texto, palabra=[]):
    table = TfIdf()
    table.add_document("informacion", texto)
    resultado = table.similarities(palabra)[0][1]
    return resultado > 0.0
def get(self):
    # standardize
    keywords = self.trimmed_stopwords(self.tokenize(self.opinion, pos='noun_verbs'))
    # search about opinion with keywords
    articles = self.search_articles(
        self.keywords + [keyword.surface for keyword in keywords][:3])
    # clean
    docs = map(self.clean, articles)
    # divide sentences
    sentences_cand = map(self.divide, docs)
    sent = []
    for s in sentences_cand:
        sent.append(list(filter(self.is_sentence, s)))
    sentences = list(chain.from_iterable(sent))
    # tfidf format
    sentence_tokens = []
    for sentence in sentences:
        noun_tokens = [token.surface for token in self.tokenize(sentence, pos='noun')]
        sentence_tokens.append(' '.join(noun_tokens))
    # vectorize
    vector = TfIdf.vector(sentence_tokens)
    # clustering
    cluster = numpy.array(TfIdf.cluster(vector, clusters=3))
    # retrieve opinion with tf
    tfidf_score = numpy.array([sum(v) for v in vector.toarray()])
    # retrieve opinion with senti
    # senti_score = numpy.array([self.senti(s) for s in sentences])
    senti_score = []
    # for s in sentences:
    #     senti_score.append(self.senti(s))
    for sentence in sentences:
        senti_tokens = [token.surface for token in self.tokenize(sentence, pos='senti')]
        senti_score.append(self.senti(senti_tokens))
    senti_score = numpy.array(senti_score)
    score_index = numpy.argsort(tfidf_score * senti_score)
    positives = []
    negatives = []
    for i in range(3):
        # retrieve vector index by cluster
        c_index = numpy.where(cluster == i)
        for k in score_index:
            if k in c_index[0]:
                negatives.append(sentences[k])
                break
        for k in score_index[::-1]:
            if k in c_index[0]:
                positives.append(sentences[k])
                break
    opinion = namedtuple('Opinion', 'positives, negatives')
    return opinion(positives, negatives)
def save_tfidf_like(parl_counter, sort_tfidf_like, counter_list, tot_counter, counter_list_parl):
    dic = dict(sort_tfidf_like)
    f = open(dir_out + "tfidf_like_parametros.csv", 'w')
    f.write("palavra" + ";" + "valor" + ";" + "frequencia" + ";" + "entropia maxima" + ";"
            + "entropia da palavra" + ";" + "prob_politica" + ";" + "entropia entre deputados" + "\n")
    for word in parl_counter:
        f.write(word + ";" + str(dic[word]) + ";"
                + '%.4f' % (TfIdf.tf(word, parl_counter)) + ";"
                + '%.4f' % (math.log2(len(counter_list))) + ";"
                + '%.4f' % (TfIdf.entropy(word, tot_counter, counter_list)) + ";"
                + '%.4f' % (TfIdf.parl_prob(word, parl_counter, counter_list)) + ";"
                + '%.4f' % (TfIdf.parl_entropy(word, tot_counter, counter_list_parl)) + "\n")
    f.close()
class SearchEngine:
    def __init__(self):
        self.tfidf = TfIdf()

    def load_documents(self, documents):
        for doc in documents:
            name = doc.name
            text = self.doc_to_text(doc)
            words = self.text_to_word_array(text)
            self.tfidf.add_document(name, words)

    def query(self, query):
        return self.tfidf.similarities(query)

    def doc_to_text(self, doc):
        parser = AKParser()
        tree = parser.parse(doc)
        text = ''
        q = deque()
        q.append(tree)
        while q:
            node = q.pop()
            if node.tag:
                if 'link' == node.tag.lower():
                    val = node.children[0].value.split('|')[0]
                    text += f' {val} '
                    continue
            if node.value:
                text += f' {node.value} '
            children = node.children
            if children:
                for c in children:
                    q.append(c)
        return re.sub(' +', ' ', text)

    def text_to_word_array(self, text):
        regex = re.compile(r'[^a-zA-Z\s]')
        text = regex.sub('', text)
        text = re.sub(' +', ' ', text)
        return text.lower().split()
def test_similarity(self): table = TfIdf() table.add_document("foo", ["a", "b", "c", "d", "e", "f", "g", "h"]) table.add_document("bar", ["a", "b", "c", "i", "j", "k"]) table.add_document("baz", ["k", "l", "m", "n"]) self.assertEqual( table.similarities(["a", "b", "c"]), [["foo", 0.6875], ["bar", 0.75], ["baz", 0.0]])
def build_tfidf_model(self, files):
    '''
    Builds the Tf-Idf model.

    :param files: List of files of the corpora
    :return: A Tf-Idf object with the model loaded
    '''
    tfidf = TfIdf()
    for file_path in files:
        with open(file_path) as f:
            doc_name = file_path.split('/')[-1]
            doc_text = f.readline().split()
            tfidf.add_document(doc_name, doc_text)
    return tfidf
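# A minimal usage sketch (hypothetical paths; `indexer` stands for whatever
# object carries this method, and `similarities` follows the TfIdf API seen
# in the other snippets here):
#     model = indexer.build_tfidf_model(['corpus/doc1.txt', 'corpus/doc2.txt'])
#     scores = model.similarities(['query', 'terms'])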
def test_tfidf(self):
    clean_tmp()
    t = TfIdf(self.data, root_dir)
    self.assertTrue(t.idf_cache['I'] < t.idf_cache['hello'])
    self.assertTrue(t.idf_cache['I'] < t.idf_cache['You'])
    self.assertTrue(t.idf_cache['I'] < t.idf_cache['not exist feature'],
                    "test default idf_default_val")
    result1 = t.tfidf_in_a_doc(self.data[1])
    self.assertTrue(result1['I'] < result1['You'])
    self.assertTrue(result1['You'] < result1['hello'])
    self.assertTrue(result1['hello'] == result1['world'])
    clean_tmp()
def postprocess_query(self, query):
    scores = sorted([(TfIdf.similarity(self, query, document), did)
                     for did, document in self.documents],
                    reverse=True)
    for _, did in scores[:PseudoFeedback.__num_expansions__]:
        # set.union returns a new set rather than mutating in place,
        # so the result must be reassigned for the expansion to take effect
        query = query.union(self.documents[did])
    return query
def tfidf_month(tw_month, random_list):
    tweets = list(itertools.chain.from_iterable(itertools.chain.from_iterable(tw_month)))
    tot_counter = Counter(tweets)
    dep_counts = list()
    for dep in tw_month:
        tw = list(itertools.chain.from_iterable(dep))
        print(tw)
        dep_counts.append(Counter(tw))
    docs_counter = docs_counters(random_list, tot_counter)
    tfidf = TfIdf()
    tfidf_like = list()
    for word in tot_counter:
        tfidf_like.append(tfidf.tf(word, tot_counter)
                          * tfidf.idf_like(word, tot_counter, tot_counter, docs_counter, dep_counts))
    sort_tfidf_like = list(zip(tot_counter.keys(), tfidf_like))
    sort_tfidf_like = sorted(sort_tfidf_like, key=lambda x: x[1], reverse=True)
    return sort_tfidf_like
def initialisation():
    pm = Parsemail()  # what was the trick for overriding methods again?? overloading, for constructors
    listeDmail, listeDmailRaci = pm.parsemail()
    fi = FichierIverse()
    dicoInv, nbmotsdocs, nbmotCorpus = fi.fichInv(listeDmail)
    dicoInvR, nbmotsdocsR, nbmotCorpusR = fi.fichInv(listeDmailRaci)
    ti = TfIdf(False, dicoInv, nbmotsdocs, nbmotCorpus)
    ti.calcul()
    ti.serialisation()
    ti2 = TfIdf(True, dicoInvR, nbmotsdocsR, nbmotCorpusR)  # True allows concatenation with the old dictionary
    ti2.calcul()
    ti2.serialisation()
def calculateTFIDFofNew(self, inputTitle, inputBody):
    title = self.textToWordsArray(inputTitle)
    sentences = self.textArrayToWordsArray(inputBody)
    if len(sentences) < 1:
        return []
    table = TfIdf()
    for i in range(0, len(sentences)):
        table.add_document("sentences" + str(i), sentences[i])
    result = []
    similarities = table.similarities(title)
    for similarity in similarities:
        result.append(similarity[1])
    resLen = len(result)
    for i in range(resLen, 5):
        result.append(0)
    return result
def tfidf_matrix(text_generator):
    """Builds tf-idf matrix from records from fname, using fields to create
    a text describing them
    """
    ti = TfIdf()
    # materialize the input: it is iterated twice below, and a one-shot
    # generator would be exhausted after the first pass
    docs = list(text_generator)
    #print "building tfidf indices"
    for i in docs:
        ti.add_input_document(i)
    A = np.zeros([ti.num_docs, len(ti.term_num_docs)])
    for i_ind, i in enumerate(docs):
        #print "-i_ind, i:", i_ind, i
        for j_ind, j in enumerate(ti.get_tfidf(i)):
            #print "-----j_ind, j:", j_ind, j
            A[i_ind, j_ind] = j
    #print ti.term_num_docs
    return A, ti
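# A minimal usage sketch (illustrative data only; assumes the TfIdf class
# above tracks num_docs and term_num_docs and exposes get_tfidf as used in
# the function body):
#     texts = ["the cat sat", "the dog ran", "cats and dogs"]
#     A, ti = tfidf_matrix(texts)
#     A.shape  # -> (number of documents, number of indexed terms)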
def test_provider():
    vocab = load_pickled('vocab.dat')
    tfidf = TfIdf(vocab, ['./data/train_pos.txt', './data/train_neg.txt'])
    lda = LdaLoader('topics.lda', 200)
    label_vectorizer = LabelVectorizer(load_pickled('labels.dat'))
    stemmer = MemoizedStemmer()
    pos_provider = TrainingSampleProvider('./data/train_pos.txt', 1, vocab, tfidf,
                                          lda, label_vectorizer, stemmer)
    return pos_provider
def gen_extra_sentences():
    word_idf_file = 'e:/el/tmpres/demo/merge/word_idf.txt'
    tfidf = TfIdf(word_idf_file)
    mesh_id_wid_file = 'e:/el/tmpres/demo/merge/mesh_id_wid.txt'
    merged_desc_file = 'e:/el/tmpres/demo/merge/merged_descriptions.txt'
    merged_tokenized_desc_file = 'e:/el/tmpres/demo/merge/merged_descriptions_tokenized.txt'
    extra_sentence_file = 'e:/el/tmpres/demo/merge/wiki_extra_sentences.txt'
    mesh_ids = list()
    wids = list()
    fin = open(mesh_id_wid_file, 'rb')
    for line in fin:
        vals = line.strip().split('\t')
        mesh_ids.append(vals[0])
        wids.append(int(vals[1]))
    fin.close()
    fin_desc = open(merged_desc_file, 'rb')
    fin_token_desc = open(merged_tokenized_desc_file, 'rb')
    fout = open(extra_sentence_file, 'wb')
    for idx, (mesh_id, mesh_desc, mesh_token_desc) in enumerate(
            izip(mesh_ids, fin_desc, fin_token_desc)):
        mesh_token_desc = mesh_token_desc.strip()
        mesh_desc_words = mesh_token_desc.split(' ')
        mesh_sentence_ends = find_sentence_ends(mesh_desc_words)
        wiki_desc = fin_desc.next().strip()
        wiki_token_desc = fin_token_desc.next().strip()
        wiki_desc_words = wiki_token_desc.split(' ')
        wiki_sentence_ends = find_sentence_ends(wiki_desc_words)
        extra_sentence_indices = get_sentences_to_add(mesh_desc_words, mesh_sentence_ends,
                                                      wiki_desc_words, wiki_sentence_ends, tfidf)
        wiki_words_to_pos_list = tokenized_text_match(wiki_desc, wiki_desc_words)
        original_sentences = get_original_sentences(wiki_desc, wiki_words_to_pos_list,
                                                    wiki_sentence_ends)
        fout.write('%s\t%d\n' % (mesh_id, len(extra_sentence_indices)))
        for j in extra_sentence_indices:
            fout.write('%s\n' % original_sentences[j])
        # if idx == 10000:
        #     break
    fin_desc.close()
    fin_token_desc.close()
    fout.close()
def main(args):
    summarizer = {
        'tfidf': TfIdf(),
        'cluster': Cluster(),
        'svd': SVD(),
        'pagerank': PageRank()
    }[args['alg']]
    summarizer.initialize(args['tf'], args['df'])
    summary = summarizer.summarize(args['doc'])
    for s in summary:
        print(s),
def callback(ch, method, properties, body):
    global tweet_obj, done, starttime, skipped, processed, systime
    json_tweet = json.loads(body)
    msg = json_tweet['sanitized_text']
    ts = int(json_tweet['timestamp'])
    msgid = int(json_tweet['id'])
    uid = int(json_tweet['user']['id'])
    tweet_obj = Tweet(msg, ts, msgid, uid)
    if utils.qualified(tweet_obj, TAGS, IGNORE, MIN_TOKENS):
        incr = TfIdf.getVals(tweet_obj)
        # print tweet.getVector()
        buckets.updateRndVec(incr)
        closeBuck = getClosestNeighborBuckets(tweet_obj)
        print(msg)
        if closeBuck[0] is not None:
            print("CLOSE BUCK: {0}, {1}".format(closeBuck[0].msg, closeBuck[1]))
        closeRecent = getClosestNeighborRecent(tweet_obj, closeBuck[1])
        closeoverall = decideClosest(closeBuck, closeRecent)
        other = closeoverall[0]
        if other:
            json_tweet['nearneigh'] = other.msgid
        else:
            json_tweet['nearneigh'] = -1
        json_tweet['cossim'] = closeoverall[1]
        channel.basic_publish(
            exchange='',
            routing_key='FYP.Q.GetStories.ClusteredTweetMessage',
            body=json.dumps(json_tweet),
            properties=pika.BasicProperties(
                delivery_mode=2,  # make message persistent
            ))
        # sys.stdout.write(json.dumps(t) + '\n')
    else:
        print('skipped')
        skipped += 1
    done += 1
    current = int(ts) / 1000
    if current - starttime > 900:
        aftertime = datetime.now()
        delta = aftertime - systime
        systime = aftertime
        dt = divmod(delta.seconds, 60)
        sys.stderr.write(str(done) + ' Tweets done in ' + str(dt[0]) + ' min '
                         + str(dt[1]) + ' sec.\n')
        starttime = current
    processed += 1
    if processed % 100 == 0:
        print('{0} tweets processed'.format(processed))
def process_texts(self):
    relevant_words = []
    path = os.path.join('data', 'wiki')
    file_names = os.listdir(path)
    documents = []
    for file_name in file_names:
        file_path = os.path.join(path, file_name)
        f = open(file_path)
        documents.append((file_name, TextBlob(str.decode(f.read(), 'UTF-8', 'ignore'))))
        f.close()
    tfidf = TfIdf(documents)
    for file_name, document in documents:
        print file_name
        scores = {word: tfidf.compute_tfidf(word, document) for word in document.words}
        selected_scores = {}
        for word in scores:
            similars = sorted(self.get_similar(scores.keys(), word))
            selected_scores[similars[-1]] = scores[word]
        sorted_words = sorted(selected_scores.items(), key=lambda x: x[1], reverse=True)
        for word, score in sorted_words[:10]:
            if word not in relevant_words:
                relevant_words.append(word)
    return set(relevant_words)
def callback(ch, method, properties, body):
    global tweet_obj, done, starttime, skipped, processed, systime
    json_tweet = json.loads(body)
    msg = json_tweet["sanitized_text"]
    ts = int(json_tweet["timestamp"])
    msgid = int(json_tweet["id"])
    uid = int(json_tweet["user"]["id"])
    tweet_obj = Tweet(msg, ts, msgid, uid)
    if utils.qualified(tweet_obj, TAGS, IGNORE, MIN_TOKENS):
        incr = TfIdf.getVals(tweet_obj)
        # print tweet.getVector()
        buckets.updateRndVec(incr)
        closeBuck = getClosestNeighborBuckets(tweet_obj)
        print msg
        if closeBuck[0] is not None:
            print "CLOSE BUCK: {0}, {1}".format(closeBuck[0].msg, closeBuck[1])
        closeRecent = getClosestNeighborRecent(tweet_obj, closeBuck[1])
        closeoverall = decideClosest(closeBuck, closeRecent)
        other = closeoverall[0]
        if other:
            json_tweet["nearneigh"] = other.msgid
        else:
            json_tweet["nearneigh"] = -1
        json_tweet["cossim"] = closeoverall[1]
        channel.basic_publish(
            exchange="",
            routing_key="FYP.Q.GetStories.ClusteredTweetMessage",
            body=json.dumps(json_tweet),
            properties=pika.BasicProperties(delivery_mode=2),  # make message persistent
        )
        # sys.stdout.write(json.dumps(t) + '\n')
    else:
        print "skipped"
        skipped += 1
    done += 1
    current = int(ts) / 1000
    if current - starttime > 900:
        aftertime = datetime.now()
        delta = aftertime - systime
        systime = aftertime
        dt = divmod(delta.seconds, 60)
        sys.stderr.write(str(done) + " Tweets done in " + str(dt[0]) + " min "
                         + str(dt[1]) + " sec.\n")
        starttime = current
    processed += 1
    if processed % 100 == 0:
        print "{0} tweets processed".format(processed)
def get_sentences_to_add(prev_text_words, prev_sentence_ends, new_text_words,
                         new_sentence_ends, tfidf):
    prev_tfidf_vecs = get_tfidf_of_sentences(prev_text_words, prev_sentence_ends, tfidf)
    new_tfidf_vecs = get_tfidf_of_sentences(new_text_words, new_sentence_ends, tfidf)
    wanted_sentence_indices = list()
    for nidx, new_tfidf_vec in enumerate(new_tfidf_vecs):
        to_add = True
        for pidx, prev_tfidf_vec in enumerate(prev_tfidf_vecs):
            sim_val = TfIdf.sim(new_tfidf_vec, prev_tfidf_vec)
            if sim_val > 0.95:
                to_add = False
                # print sim_val, 'too similar'
                break
        if to_add:
            wanted_sentence_indices.append(nidx)
    return wanted_sentence_indices
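# If TfIdf.sim is a cosine similarity over the sentence tf-idf vectors (an
# assumption; its definition is not part of this snippet), the 0.95 cutoff
# keeps a new sentence only when no previous sentence points in nearly the
# same direction:
#     sim(u, v) = (u . v) / (||u|| * ||v||)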
def train_setup(vocab_file, pos_file, neg_file, cluster_labels_file, validation_file):
    vocab = load_pickled(vocab_file)
    tfidf = TfIdf(vocab, [pos_file, neg_file])
    label_vectorizer = LabelVectorizer(load_pickled(cluster_labels_file))
    stemmer = MemoizedStemmer()
    pos_provider = TrainingSampleProvider(pos_file, 1, vocab, tfidf,
                                          label_vectorizer, stemmer)
    neg_provider = TrainingSampleProvider(neg_file, -1, vocab, tfidf,
                                          label_vectorizer, stemmer)
    merged = SampleMerger(pos_provider, neg_provider)
    validation_provider = ValidationSampleProvider(validation_file, None, vocab,
                                                   tfidf, label_vectorizer, stemmer)
    return merged, validation_provider
def train_setup():
    vocab = load_pickled('vocab.dat')
    tfidf = TfIdf(vocab, ['./data/train_pos.txt', './data/train_neg.txt'])
    lda = LdaLoader('topics.lda', 200)
    label_vectorizer = LabelVectorizer(load_pickled('labels.dat'))
    stemmer = MemoizedStemmer()
    pos_provider = TrainingSampleProvider('./data/train_pos.txt', 1, vocab, tfidf,
                                          lda, label_vectorizer, stemmer)
    neg_provider = TrainingSampleProvider('./data/train_neg.txt', -1, vocab, tfidf,
                                          lda, label_vectorizer, stemmer)
    merged = SampleMerger(pos_provider, neg_provider)
    validation_provider = ValidationSampleProvider('./data/test_data.txt', None, vocab,
                                                   tfidf, lda, label_vectorizer, stemmer)
    return merged, validation_provider
class KeyWordGetter():
    """
    Class to determine the significant unique keywords of a page.
    Uses the TF-IDF algorithm (http://en.wikipedia.org/wiki/Tf%E2%80%93idf).
    """

    def __init__(self):
        self.myTfIdf = TfIdf("corpus10k.txt", "stopwords10k.txt")
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        self.opener = opener

    def get_keywords_from_url(self, url, num_words=5, MAX_STR_LEN=1000):
        """
        Returns a list of tuples pairing each unique keyword of the page at
        the given url with its TF-IDF significance, a number in (0, 1).
        """
        clean_text = self.get_clean_text(url)
        if len(clean_text) > MAX_STR_LEN:
            clean_text = clean_text[:MAX_STR_LEN]
        keywords = []
        for pair in self.myTfIdf.get_doc_keywords(clean_text)[0:num_words]:
            keywords.append(pair)
        return keywords

    def get_clean_text(self, URL):
        """
        Returns the contents of a url's html page with tags removed.
        """
        response = self.opener.open(URL)
        html = response.read()
        return nltk.clean_html(html)
    alea_processed.append(get_bigrams(temp, 3, True))

with open(dir_out + "list_alea_bigrams.pck", 'wb') as handle:
    pickle.dump(alea_processed, handle)
with open(dir_out + "list_alea_trigrams.pck", 'wb') as handle:
    pickle.dump(alea_tri_processed, handle)

bgr_counter = parl_bigrams.ngram_fd
parl_bgr_counter = [l.ngram_fd for l in parl_processed]
docs_bgr_counter = [l.ngram_fd for l in alea_processed]
docs_bgr_counter.append(bgr_counter)
tfidf = TfIdf()
tfidf_smooth = list()
for bgr in bgr_counter:
    tfidf_smooth.append(tfidf.tf(bgr, bgr_counter) * tfidf.idf_smooth(bgr, docs_bgr_counter))
dic_tfidf_smooth = list(zip(bgr_counter.keys(), tfidf_smooth))
dic_tfidf_smooth = sorted(dic_tfidf_smooth, key=lambda x: x[1], reverse=True)
tot_counter = dict()
for y in docs_bgr_counter:
    for k in y.keys():
        tot_counter[k] = k in tot_counter and tot_counter[k] + y[k] or y[k]
tfidf_like = list()
for bgr in bgr_counter:
    tfidf_like.append(tfidf.tf(bgr, bgr_counter)
                      * tfidf.idf_like(bgr, bgr_counter, tot_counter, docs_bgr_counter, parl_bgr_counter))
from tfidf import TfIdf
import pandas as pd

corpuspath = '/Users/goksukara/Desktop/Projects/EclipseWorkspace/Specilization/PhytonCode/Data/'

if __name__ == "__main__":
    Tf_idf = TfIdf(corpuspath + 'Gensim_output')
    Tf_idf.loaddictionary()
    Tf_idf.buildmodel()
    Tf_idf.saveModel()
    Tf_idf.getTF_IDF()
    # print(Tf_idf.corpus_dict)
    # Tf_idf.listnhighIdfs(4)
dir_in = "/Users/lucasso/Dropbox/UFMG/Processamento de Linguagem Natural/random_pck/docs/" dir_parl = "/Users/lucasso/Documents/pck/" dir_out = "/Users/lucasso/Dropbox/UFMG/Processamento de Linguagem Natural/" file_parl = "/Users/lucasso/Dropbox/UFMG/Processamento de Linguagem Natural/random_pck/docs/deputados.pck" tfidf_n = list() tf_log_idf = list() tfidf_like = list() corr = "" with open(file_parl, 'rb') as handle: parl_counter = pickle.load(handle) tot_counter,counter_list,_ = loadCounters(dir_in) tot_counter_dep,counter_list_dep,pck= loadCounters(dir_parl) tfidf = TfIdf() for word in parl_counter: tf = tfidf.tf(word, parl_counter) idf = tfidf.idf(word,counter_list) log_idf = tfidf.idf_smooth(word,counter_list) ent_idf = tfidf.idf_like(word,parl_counter, tot_counter, counter_list, counter_list_dep) tfidf_n.append(tf*idf) tf_log_idf.append(tf*log_idf) tfidf_like.append(tf*ent_idf) dic_tfidf= list(zip(parl_counter.keys(), tfidf_n)) dic_tf_log_idf= list(zip(parl_counter.keys(), tf_log_idf)) dic_tfidf_like= list(zip(parl_counter.keys(), tfidf_like)) """ corr += "tfidf X tfidf_smooth: "+str(stats.spearmanr([v for i,v in dic_tfidf] ,[v for i,v in dic_tf_log_idf]))+"\n"
class TestSequenceFunctions(unittest.TestCase):
    def setUp(self):
        self.unk_cutoff = 2
        self.vocab = TfIdf(unk_cutoff=self.unk_cutoff)

    def test_vocab(self):
        self.vocab.train_seen("a", 300)
        self.vocab.train_seen("b")
        self.vocab.train_seen("c")
        self.vocab.finalize()
        # Infrequent words should look the same
        self.assertEqual(self.vocab.vocab_lookup("b"), self.vocab.vocab_lookup("c"))
        # Infrequent words should look the same as never seen words
        self.assertEqual(self.vocab.vocab_lookup("b"), self.vocab.vocab_lookup("d"), "")
        # The frequent word should be different from the infrequent word
        self.assertNotEqual(self.vocab.vocab_lookup("a"), self.vocab.vocab_lookup("b"))

    def test_censor(self):
        self.vocab.train_seen("a", 300)
        self.vocab.train_seen("b")
        self.vocab.train_seen("c")
        self.vocab.finalize()
        censored_a = [str(x) for x in self.vocab.tokenize("a b d")]
        censored_b = [str(x) for x in self.vocab.tokenize("d b a")]
        censored_c = [str(x) for x in self.vocab.tokenize("a b d")]
        censored_d = [str(x) for x in self.vocab.tokenize("b d a")]
        self.assertEqual(censored_a, censored_c)
        self.assertEqual(censored_b, censored_d)
        # Should add start and end tag
        print(censored_a)
        self.assertEqual(len(censored_a), 3)
        self.assertEqual(censored_a[0], censored_b[2])
        self.assertEqual(censored_a[1], censored_b[0])

    def test_tf(self):
        self.vocab.train_seen("a", 300)
        self.vocab.finalize()
        self.vocab.add_document("a a b")
        # Test MLE
        word_a = self.vocab.vocab_lookup("a")
        word_b = self.vocab.vocab_lookup("b")
        word_c = self.vocab.vocab_lookup("c")
        self.assertAlmostEqual(self.vocab.term_freq(word_a), 0.66666666)
        self.assertAlmostEqual(self.vocab.term_freq(word_b), 0.33333333)
        self.assertAlmostEqual(self.vocab.term_freq(word_c), 0.33333333)

    def test_df(self):
        self.vocab.train_seen("a", 300)
        self.vocab.train_seen("b", 100)
        self.vocab.finalize()
        self.vocab.add_document("a a b")
        self.vocab.add_document("b b c")
        self.vocab.add_document("a a a")
        self.vocab.add_document("a a a")
        # Test MLE
        word_a = self.vocab.vocab_lookup("a")
        word_b = self.vocab.vocab_lookup("b")
        word_c = self.vocab.vocab_lookup("c")
        word_d = self.vocab.vocab_lookup("d")
        self.assertAlmostEqual(self.vocab.inv_docfreq(word_a), log10(1.3333333))
        self.assertAlmostEqual(self.vocab.inv_docfreq(word_b), log10(2.0))
        self.assertAlmostEqual(self.vocab.inv_docfreq(word_c), log10(4.0))
        self.assertAlmostEqual(self.vocab.inv_docfreq(word_d), log10(4.0))
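# The expected values above imply the following (inferred from the asserted
# numbers, not from the library's documentation): words trained fewer than
# unk_cutoff times, or never trained, share a single <UNK> id, so "b" and
# "c" read as <UNK> in test_tf, and "c" and "d" do in test_df, with
#     term_freq(w)   = count(w in the added document) / tokens in document
#                      e.g. "a" in "a a b" -> 2/3
#     inv_docfreq(w) = log10(num_documents / num_documents_containing(w))
#                      e.g. "a" in 3 of 4 documents -> log10(4/3)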
    categories_counter.append(Counter(tmp))
    test_data.append(categ[:k])

print("process tfidf")
tfidf_entropy = list()
tfidf_smooth = list()
tfidf_like = list()
for i, data in enumerate(categories_counter):
    tmp_smooth = dict()
    tmp_like = dict()
    tmp_entropy = dict()
    print("dataset: " + str(i))
    for word in data:
        tf = TfIdf.tf(word, data)
        tmp_entropy[word] = tf * TfIdf.idf_entropy(word, i, categories_counter)
        tmp_smooth[word] = tf * TfIdf.idf_smooth(word, categories_counter)
        tmp_like[word] = tf * TfIdf.idf_like(word, i, categories_counter)
    tfidf_smooth.append(tmp_smooth)
    tfidf_like.append(tmp_like)
    tfidf_entropy.append(tmp_entropy)

print("processing softmax confusion matrix")
confusion_like = np.zeros(shape=(len(test_data), len(test_data)))
confusion_smooth = np.zeros(shape=(len(test_data), len(test_data)))
confusion_entropy = np.zeros(shape=(len(test_data), len(test_data)))
for i, data in enumerate(test_data):
    for tw in data:
        j, value = classifier_s(tw, tfidf_like)
        confusion_like[i, j] += 1
except IndexError:
    save_file = 'pickled_tfidf.pickle'

print "saving to ", save_file
try:
    with open(save_file) as rh:
        top_100 = cPickle.load(rh)
except IOError:
    top_100 = {}
print "proceeding with", len(top_100), "previous tfidf docs"
with open(save_file, 'w') as wh:
    wh.write('0\n')
comment_model = TfIdf(corpus_filename="idf_model_filteredsorted.txt",
                      stopword_filename="curated_stopwords.txt",
                      DEFAULT_IDF=0.0000001)  # if not in idf model, give a very low score, since the model is filtered
# find the number of beers for progress indication
c.execute("SELECT id from beer")
total_beers = len(list(c.fetchall()))
print "calculating tfidf of ", total_beers, "beers."
c.execute("SELECT id, name FROM beer")
idx = 0  # don't want to unwrap the generator so we'll index this way
worked = 0
for beer_id, name in c.fetchall():
    if idx % 1000 == 0:
        print """*-*-*-* Finished {0}% of the processing.""".format(float(idx) / total_beers)
        with open(save_file, 'w') as wh:
            cPickle.dump(top_100, wh)
    idx += 1
tweets = list(itertools.chain.from_iterable(list(itertools.chain.from_iterable(parl_tw_processed))))
tot_counter = Counter(tweets)
parl_counters = list()
for parl in parl_tw_processed:
    tw = list(itertools.chain.from_iterable(parl))
    parl_counters.append(Counter(tw))
docs_counter = list()
docs_counter.append(tot_counter)
docs_counter.append(coleta1)
docs_counter.append(coleta2)
tfidf = TfIdf()
tfidf_like = list()
for word in tot_counter:
    tfidf_like.append(tfidf.tf(word, tot_counter)
                      * tfidf.idf_like(word, tot_counter, tot_counter, docs_counter, parl_counters))
sort_tfidf_like = list(zip(tot_counter.keys(), tfidf_like))
sort_tfidf_like = sorted(sort_tfidf_like, key=lambda x: x[1], reverse=True)
with open(dir_rob + "sort_tfidf_like.pck", 'wb') as handle:
    pickle.dump(sort_tfidf_like, handle)
with open(dir_rob + "tfidf_like.pck", 'wb') as handle:
    pickle.dump(tfidf_like, handle)
with open(dir_rob + "parl_tw_processed.pck", 'wb') as handle:
    pickle.dump(parl_tw_processed, handle)
class TweetAdder:
    #@perftest
    def __init__(self, sql_obj=None):
        if not sql_obj:
            self.sql = SQLQuery()
        else:
            self.sql = sql_obj
        self.tfidf_obj = TfIdf()
        self.ids = None

    def addTimelineTweet(self, timeline_tweet):
        """
        Converts timeline tweet to search api format and adds it as a celebrity tweet.
        """
        tweet = self.convertTimelineTweet(timeline_tweet)
        self.add(tweet, created_at_is_obj=True, tweet_table="tweets")

    def addNonCelebTimelineTweet(self, timeline_tweet):
        """
        Converts timeline tweet to Search API format and adds it as a non-celebrity tweet.
        """
        self.add(self.convertTimelineTweet(timeline_tweet),
                 created_at_is_obj=True, tweet_table="tweets_non_celeb")

    def convertTimelineTweet(self, timeline_tweet):
        """
        Converts a timeline tweet to the format returned by the Search API.
        """
        tweet = {}
        created_at = replaceMonth(timeline_tweet['created_at'])
        dt = datetime.datetime(int(created_at[25:]), int(created_at[4:6]),
                               int(created_at[7:9]), int(created_at[10:12]),
                               int(created_at[13:15]), int(created_at[16:18]))
        tweet['created_at'] = dt
        tweet['from_user'] = timeline_tweet['user']['screen_name']
        tweet['from_user_id'] = timeline_tweet['user']['id']
        tweet['from_user_name'] = timeline_tweet['user']['name']
        tweet['geo'] = timeline_tweet['user']['location']
        tweet['id'] = timeline_tweet['id']
        tweet['iso_language_code'] = timeline_tweet['user']['lang']
        tweet['metadata'] = {'result_type': 'timeline'}
        tweet['profile_image_url'] = timeline_tweet['user']['profile_image_url']
        tweet['source'] = timeline_tweet['source']
        tweet['text'] = timeline_tweet['text']
        tweet['to_user'] = timeline_tweet['in_reply_to_screen_name']
        tweet['to_user_id'] = timeline_tweet['in_reply_to_user_id']
        tweet['to_user_name'] = None
        return tweet

    def add(self, tweet, created_at_is_obj=False, tweet_table="tweets"):
        """
        Adds a tweet to tweet_table (celebrity tweet table by default).
        Tweet must be in the format provided by Search API.
        """
        if not self.ids:
            self.ids = [i[0] for i in self.sql.q("SELECT id FROM tweets")]
        debuglog.msg("Inserting tweet", tweet['id'])
        #debuglog.pprint_msg(tweet)
        if not created_at_is_obj:
            dt = datetime.datetime.strptime(replaceMonth(tweet['created_at'][5:25]),
                                            "%d %m %Y %H:%M:%S")
        else:
            dt = tweet['created_at']
        created_at = dt.strftime("%Y-%m-%d %H:%M:%S")
        dicvals = {'created_at': created_at,
                   'from_user': tweet['from_user'],
                   'from_user_id': tweet['from_user_id'],
                   'from_user_name': tweet['from_user_name'],
                   'geo': str(tweet['geo']),
                   'id': tweet['id'],
                   'iso_language_code': tweet['iso_language_code'],
                   'metadata': str(tweet['metadata']),
                   'profile_image_url': tweet['profile_image_url'],
                   'source': tweet['source'],
                   'text': tweet['text'],
                   'to_user': tweet['to_user'],
                   'to_user_id': tweet['to_user_id'],
                   'to_user_name': tweet['to_user_name']}
        dicq = "INSERT IGNORE INTO " + tweet_table
        dicq += """ VALUES(%(created_at)s, %(from_user)s, %(from_user_id)s,
                    %(from_user_name)s, %(geo)s, %(id)s, %(iso_language_code)s,
                    %(metadata)s, %(profile_image_url)s, %(source)s, %(text)s,
                    %(to_user)s, %(to_user_id)s, %(to_user_name)s)"""
        if tweet['id'] not in self.ids:
            succeeded = False
            try:
                self.sql.q(dicq, dicvals)
                succeeded = True
            except UnicodeEncodeError:
                try:
                    debuglog.msg("\tUNIDECODE ERROR, trying decode...")
                    for k in dicvals:
                        dicvals[k] = unidecode(dicvals[k])
                    self.sql.q(dicq, dicvals)
                    succeeded = True
                except:
                    debuglog.msg("\tUnidecode failed :(")
            if succeeded and tweet_table == 'tweets':
                tokens = self.tfidf_obj.get_tokens(tweet['text'])
                self.addTokens(tweet, tokens)
                self.addTokenMapping(tweet, tokens)
            return succeeded
        debuglog.msg("\ttweet already existed")
        return False

    def addTokens(self, tweet, tokens=None):
        if tokens is None:
            txt = tweet['text']
            tokens = self.tfidf_obj.get_tokens(txt)
        if not tokens or not len(tokens):
            return
        count = 0
        vals = {}
        q = "INSERT IGNORE INTO tokens (token, type) VALUES"
        for token in tokens:
            #print(token)
            vals['token' + str(count)] = token[0]
            vals['type' + str(count)] = token[1]
            q += "(%(token" + str(count) + ")s, %(type" + str(count) + ")s),"
            count += 1
        q = q[:len(q) - 1]  # remove last comma
        self.sql.q(q, vals)

    def addTokenMapping(self, tweet, tokens=None):
        if tokens is None:
            txt = tweet['text']
            tokens = self.tfidf_obj.get_tokens(txt)
        if not tokens or not len(tokens):
            return
        count = 0
        vals = {'user': tweet['from_user'], 'tweet_id': tweet['id']}
        q = "INSERT INTO token_user_mapping (user, token, tweet_id) VALUES"
        for token in tokens:
            q += "(%(user)s, %(token" + str(count) + ")s, %(tweet_id)s),"
            vals['token' + str(count)] = token[0]
            count += 1
        #print("token mapping query",q)
        q = q[:len(q) - 1]  # remove last comma
        self.sql.q(q, vals)

    def deleteCeleb(self, celeb):
        """
        Back up and delete data for a celeb who doesn't make the cut.
        """
        self.backupCeleb(celeb)
        vals = {'celeb': celeb}
        q = "DELETE FROM celebs WHERE user=%(celeb)s"
        self.sql.q(q, vals)
        q = "DELETE FROM celeb_tfidf WHERE user=%(celeb)s"
        self.sql.q(q, vals)
        q = "DELETE FROM token_user_mapping WHERE user=%(celeb)s"
        self.sql.q(q, vals)
        q = "DELETE FROM tweets WHERE from_user=%(celeb)s"
        self.sql.q(q, vals)
        print("Deleted", celeb)

    def backupCeleb(self, celeb):
        """
        Back up data for a celeb (before deleting them).
        """
        vals = {'celeb': celeb}
        q = "INSERT INTO celebs_deleted (user) VALUES(%(celeb)s)"
        self.sql.q(q, vals)
        q = "INSERT INTO celeb_tfidf_deleted (SELECT * FROM celeb_tfidf_all WHERE user=%(celeb)s)"
        self.sql.q(q, vals)
        q = "INSERT INTO token_user_mapping_deleted (SELECT * FROM token_user_mapping WHERE user=%(celeb)s)"
        self.sql.q(q, vals)
        q = "INSERT INTO tweets_deleted (SELECT * FROM tweets WHERE from_user=%(celeb)s)"
        self.sql.q(q, vals)

    def fixTokens(self):
        q = "SELECT text, from_user, id FROM tweets"
        results = self.sql.q(q)
        failures = []
        f = open('token_fix_failures.txt', 'w')
        for result in results:
            debuglog.msg("Adding tokens for tweet", result[2])
            try:
                self.addTokens({'text': result[0], 'from_user': result[1]})
                self.addTokenMapping({'text': result[0], 'from_user': result[1],
                                      'id': result[2]})
            except:
                failures.append(result[2])
                debuglog.msg("\tAdding tokens failed!")
                debuglog.msg("\tFailures so far:", len(failures))
                f.write(result[2] + "\n")
        f.close()
        debuglog.msg(failures)

    def identityMissingTweets(self):
        f = open('added_tweets.txt')
        added_tweet_ids = [int(line.replace('\n', '')) for line in f.readlines()]
        f.close()
        #pprint.pprint(added_tweet_ids)
        q = "SELECT id FROM tweets"
        all_tweet_ids = [result[0] for result in self.sql.q(q)]
        missing_tweets = list(filter(lambda x: x not in added_tweet_ids, all_tweet_ids))
        pprint.pprint(missing_tweets)

    def fixTokensInterrupted(self):
        f = open('missing_tweets2.txt')
        missing_tweets = [line.replace('\n', '') for line in f.readlines()]
        q = "SELECT text, from_user, id FROM tweets WHERE id IN("
        vals = {}
        q += ','.join(missing_tweets) + ')'
        results = self.sql.q(q)
        #pprint.pprint(results)
        #return
        failures = []
        f = open('token_fix_failures.txt', 'w')
        for result in results:
            debuglog.msg("Adding tokens for tweet", result[2])
            try:
                self.addTokens({'text': result[0], 'from_user': result[1]})
                self.addTokenMapping({'text': result[0], 'from_user': result[1],
                                      'id': result[2]})
            except:
                failures.append(result[2])
                debuglog.msg("\tAdding tokens failed!")
                debuglog.msg("\tFailures so far:", len(failures))
                f.write(result[2] + "\n")
        f.close()
        debuglog.msg(failures)

    def addTweetIdsToTokenUserMapping(self):
        q = "SELECT user FROM celebs WHERE user!='ladygaga'"
        celebs = [result[0] for result in self.sql.q(q)]
        count = 0
        for celeb in celebs:
            print("Adding tweet ids for", celeb)
            q = "SELECT id, text FROM tweets WHERE from_user=%(celeb)s"
            vals = {'celeb': celeb}
            celeb_tweets = self.sql.q(q, vals)
            num_tweets = str(len(celeb_tweets))
            tweet_count = 0
            for tweet in celeb_tweets:
                tokens = self.tfidf_obj.get_tokens(tweet[1])
                for token in tokens:
                    vals = {'celeb': celeb, 'token': token[0], 'tweet_id': tweet[0]}
                    q = ("UPDATE token_user_mapping SET tweet_id=%(tweet_id)s "
                         "WHERE user=%(celeb)s AND token=%(token)s AND tweet_id is null LIMIT 1;")
                    self.sql.q(q, vals)
                tweet_count += 1
                print("\t%s/%s tweets." % (tweet_count, num_tweets))
            count += 1
            print("%s%% of celebs updated." % str(100 * count / float(len(celebs))))
print("Getting data from " + url.strip() + "...", end="", flush=True) response = get(url=url) print("done!\nParsing HTML data...", end="", flush=True) parser.feed(response.text) print("done!") keydict = parser.get_keydict() urldata = {"url": url, "keywords": keydict} id_md5 = hashlib.md5(url.encode()).hexdigest() docs[id_md5] = urldata ti = TfIdf(docs) for kd, d in docs.items(): print("Processing document " + kd + "...", end="", flush=True) for kw, t in d['keywords'].items(): docs[kd]['keywords'][kw]['tf_idf'] = ti.tf_idf(kw, kd) print("done!") fname = 'webdirectory.txt' print("Saving to file " + fname + "...", end="", flush=True) with open(fname, 'w') as file: file.write(json.dumps(docs, sort_keys=False)) print("done!\nCompleted!")
if __name__ == '__main__':
    cf = configparser.ConfigParser()
    cf.read("file_path.properties")
    path = dict(cf.items("file_path"))
    dir_in = path['dir_in']
    dir_out = path['dir_out']
    dir_ale = path['dir_ale']
    dir_rob = path['dir_rob']
    tot_counter, parl_counter_list = load_counters(dir_out)
    tp = TextProcessor()
    tfidf = TfIdf()
    word_entropy = dict()
    for word in tot_counter:
        word_entropy[word] = tfidf.parl_entropy(word, tot_counter, parl_counter_list)
    freq = [int(math.pow(2, x)) for x in word_entropy.values()]
    plt.hist(freq, 15)
    plt.xticks(np.arange(0, max(freq), 20))
    #plt.gca().set_yscale("log")
    plt.xlabel("# de deputados que utilizaram a palavra")
    plt.ylabel("# palavras utilizadas pelos deputados")
    plt.show()
    plt.clf()
except IndexError:
    if len(results) == 0:
        print("No results were found for this query.")
        exit()
    else:
        pass
for posting in list_of_postings:
    results = set(results).intersection(posting)
if len(important) > 0:
    for res in results:
        important.append(res)
#vectorizer = TfidfVectorizer()
table = TfIdf()
G = nx.Graph()
# return urls corresponding to numbers
with open("url_files.csv") as f:
    urls = [row for row in csv.reader(f)]
if len(results) != 0:
    for x in results:
        f = open(urls[x - 1][0])
        obj = json.load(f)
        soup = BeautifulSoup(obj["content"], "html.parser", from_encoding="iso-8859-1")
        joinedText = [
class GUI:
    __directories = {"Documents": "data//documents-lab1.txt",
                     "Keywords": "data//keywords-lab1.txt"}

    def do_search(self, widget, data):
        query = self.entry.get_text()
        result = self.tfidf.rank(query)
        self.result_view.show_documents(result)
        if self.is_query_expanding_active:
            new_queries = self.query_expander.expand(query)
            list = []
            for new_query in new_queries:
                if len(new_query) >= 1:
                    list.append(" ".join(new_query))
            self.query_expander_view.show_queries(list)
        #self.text_area.get_buffer().set_text(self.tfidf.get_result())

    def delete_event(self, widget, event, data=None):
        gtk.main_quit()
        return False

    def show_keywords(self):
        self.keywords_area.get_buffer().set_text(self.tfidf.get_keywords_string())

    def toggle_query_expanding(self, widget):
        self.is_query_expanding_active = not self.is_query_expanding_active
        if self.is_query_expanding_active:
            self.query_expander_container.show()
        else:
            self.query_expander_container.hide()
            self.query_expander_view.remove_old_buttons()
        print self.is_query_expanding_active

    def open_file(self, widget, name):
        text = "Select {0} Source File".format(name)
        filechooserdialog = gtk.FileChooserDialog(text, None,
                                                  gtk.FILE_CHOOSER_ACTION_OPEN,
                                                  (gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
                                                   gtk.STOCK_OK, gtk.RESPONSE_OK))
        response = filechooserdialog.run()
        if response == gtk.RESPONSE_OK:
            self.__directories[name] = filechooserdialog.get_filename()
            self.tfidf = TfIdf(self.__directories['Documents'], self.__directories['Keywords'])
            self.query_expander = QueryExpander()
            self.query_expander.loadKeywords(self.__directories['Keywords'])
            print "directories"
            print self.__directories
            #self.tfidf.print_stemmed_keywords()
            self.show_keywords()
        filechooserdialog.destroy()

    def __init__(self):
        self.query_expander = QueryExpander()
        self.query_expander.loadKeywords(self.__directories['Keywords'])
        # inits window and connects delete event
        print gtk.pygtk_version
        self.is_query_expanding_active = True
        self.tfidf = TfIdf(self.__directories['Documents'], self.__directories['Keywords'])
        print 'Dokument', self.tfidf.print_documents()
        self.window = gtk.Window(gtk.WINDOW_TOPLEVEL)
        self.window.set_title("Czesc Milosz!")
        self.window.connect("delete_event", self.delete_event)
        self.window.set_default_size(600, 500)
        self.window.set_border_width(5)
        # prepare layout
        self.box1 = gtk.VBox(False, 5)
        self.window.add(self.box1)
        document_view = self.get_document_textarea_layout()
        self.box1.pack_start(self.get_menu_box(), False, False, 0)
        self.box1.pack_start(self.get_search_panel_layout(), False, False, 0)
        self.box1.pack_start(self.get_query_expander_view(), True, True, 0)
        self.box1.pack_start(self.get_result_layaut(), True, True, 0)
        self.box1.pack_start(document_view, True, True, 0)
        self.box1.pack_start(self.get_keywords_layout(), True, True, 0)
        self.show_keywords()
        #self.tfidf.print_stemmed_keywords()
        self.window.show_all()

    def get_query_expander_view(self):
        box = gtk.VBox()
        check_box = gtk.CheckButton("Query Expanding")
        check_box.set_active(True)
        check_box.connect("clicked", self.toggle_query_expanding)
        box.pack_start(check_box, False, False, 0)
        self.query_expander_container = gtk.ScrolledWindow()
        self.query_expander_container.set_policy(gtk.POLICY_AUTOMATIC, gtk.POLICY_AUTOMATIC)
        self.query_expander_view = QueryExpanderView(self)
        self.query_expander_container.add_with_viewport(self.query_expander_view)
        box.pack_start(self.query_expander_container, True, True, 0)
        return box

    def get_menu_box(self):
        # menu
        mb = gtk.MenuBar()
        filemenu = gtk.Menu()
        file_item = gtk.MenuItem("File")
        file_item.set_submenu(filemenu)
        open_document_item = gtk.MenuItem("Open Documents Source")
        open_document_item.connect("activate", self.open_file, "Documents")
        open_keywords_item = gtk.MenuItem("Open Keywords Source")
        open_keywords_item.connect("activate", self.open_file, "Keywords")
        exit_item = gtk.MenuItem("Exit")
        exit_item.connect("activate", gtk.main_quit)
        filemenu.append(open_document_item)
        filemenu.append(open_keywords_item)
        filemenu.append(exit_item)
        mb.append(file_item)
        return mb

    def get_search_panel_layout(self):
        # prepare layout
        self.similar_list = []
        hbox = gtk.HBox(False, 5)
        self.entry = gtk.Entry()
        completion = gtk.EntryCompletion()
        self.liststore = gtk.ListStore(str)
        self.entry.set_completion(completion)
        completion.set_model(self.liststore)
        completion.set_text_column(0)
        for item in self.similar_list:
            self.liststore.append([item])
        hbox.pack_start(self.entry, True, True, 0)
        # prepare search button
        btn_search = gtk.Button("Szukaj")
        btn_search.connect("clicked", self.do_search, "button 2")
        hbox.pack_start(btn_search, True, True, 0)
        return hbox

    def get_result_layaut(self):
        sw = gtk.ScrolledWindow()
        sw.set_policy(gtk.POLICY_AUTOMATIC, gtk.POLICY_AUTOMATIC)
        self.result_view = ResultView(self.document_area)
        sw.add_with_viewport(self.result_view)
        # prepare text area for results
        #self.text_area = gtk.TextView()
        #sw = gtk.ScrolledWindow()
        #sw.set_policy(gtk.POLICY_AUTOMATIC, gtk.POLICY_AUTOMATIC)
        #self.text_area = gtk.TextView()
        #sw.add(self.text_area)
        return sw

    #def get_bottomarea_layout(self):
    #    bottom = gtk.HBox(False, 0)
    #    bottom.pack_start(self.get_document_textarea_layout(), False, False, 0)
    #    bottom.pack_start(self.get_keywords_layout(), False, False, 0)
    #    return bottom

    def get_document_textarea_layout(self):
        # prepare text area for results
        frame = gtk.Frame()
        sw = gtk.ScrolledWindow()
        sw.set_policy(gtk.POLICY_AUTOMATIC, gtk.POLICY_AUTOMATIC)
        self.document_area = gtk.TextView()
        sw.add(self.document_area)
        frame.set_label("Document:")
        frame.add(sw)
        return frame

    def get_keywords_layout(self):
        # prepare text area for results
        frame = gtk.Frame()
        sw = gtk.ScrolledWindow()
        sw.set_policy(gtk.POLICY_AUTOMATIC, gtk.POLICY_AUTOMATIC)
        self.keywords_area = gtk.TextView()
        sw.add(self.keywords_area)
        frame.set_label("Keywords:")
        frame.add(sw)
        return frame
with open(dir_ale+"coleta3.pck", 'rb') as data_file: dataset.append(remove_irrelevant(pickle.load(data_file))) print("process tfidf") tfidf_entropy = list() tfidf_smooth = list() tfidf_like = list() for i , data in enumerate(dataset): tmp_smooth = dict() tmp_like = dict() tmp_entropy = dict() print("dataset: " + str(i)) for word in data: tf = TfIdf.tf(word, data) tmp_entropy[word] = tf * TfIdf.idf_entropy(word, i, dataset) tmp_smooth[word] = tf * TfIdf.idf_smooth(word, dataset) tmp_like[word] = tf * TfIdf.idf_like(word, i, dataset) tfidf_smooth.append(tmp_smooth) tfidf_like.append(tmp_like) tfidf_entropy.append(tmp_entropy) print("save tfidf") with open(dir_out+"tfidf_entropy.pck", 'wb') as handle: pickle.dump(tfidf_entropy, handle) with open(dir_out+"tfidf_smooth.pck", 'wb') as handle: pickle.dump(tfidf_smooth, handle)
def idf_like(word, parl_counter, tot_counter, doc_counter, counter_list_parl):
    return ((math.log2(len(doc_counter)) - TfIdf.entropy(word, tot_counter, doc_counter))
            * TfIdf.parl_prob(word, parl_counter, doc_counter)
            * TfIdf.parl_entropy(word, tot_counter, counter_list_parl))
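# A reading of the expression above (the names are the module's own; the
# interpretation is inferred from how they are used, not from a documented
# derivation):
#     idf_like(w) = (H_max - H(w)) * P_parl(w) * H_parl(w)
# where H_max = log2(N) is the maximum entropy over the N document counters,
# H(w) is the entropy of w's distribution across those counters, P_parl(w)
# its probability in the parliamentary counter, and H_parl(w) its entropy
# across the per-parliamentarian counters: words concentrated in few random
# documents but spread across many parliamentarians score highest.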
def test_similarity(self): table = TfIdf() table.add_document("doc1", [ "The", "game", "of", "life", "is", "a", "game", "of", "everlasting", "learning" ]) table.add_document( "doc2", ["The", "unexamined", "life", "is", "not", "worth", "living"]) table.add_document("doc3", ["Never", "stop", "learning"]) table.calculate_tf() table.calculate_idf() table.calculate_tf_idf() """self.assertEqual( table.similarities(["life","learning"]), [["foo", 1.0], ["bar", 0.707106781], ["baz", 0.707106781]])""" print(table.similarities(["life", "learning"]))
def idf_pow(word, parl_counter, tot_counter, doc_counter, counter_list_parl, b1, b2):
    h_max = math.log2(len(doc_counter))
    h_word = TfIdf.entropy(word, tot_counter, doc_counter)
    x = math.pow(2, h_word) / math.pow(2, h_max)
    return (expon.pdf(h_word, scale=0.2)
            * TfIdf.parl_prob(word, parl_counter, doc_counter)
            * beta.pdf(x, b1, b2))
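# idf_pow appears to rescale the same quantities through two densities (an
# interpretation of the code, assuming expon and beta are the scipy.stats
# distributions and b1, b2 are Beta shape parameters chosen by the caller):
#     x = 2^H(w) / 2^H_max, the word's effective document spread in (0, 1]
#     idf_pow(w) = Expon(scale=0.2).pdf(H(w)) * P_parl(w) * Beta(b1, b2).pdf(x)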
def idf_like(word, parl_counter, tot_counter, doc_counter, counter_list_parl):
    return ((math.log2(len(doc_counter)) - TfIdf.entropy(word, tot_counter, doc_counter))
            * TfIdf.parl_prob(word, parl_counter, doc_counter)
            * TfIdf.parl_entropy(word, tot_counter, counter_list_parl))

if __name__ == '__main__':
    cf = configparser.ConfigParser()
    cf.read("file_path.properties")
    path = dict(cf.items("file_path"))
    dir_in = path['dir_in']
    dir_out = path['dir_out']
    dir_ale = path['dir_ale']
    dir_pck = path['dir_pck']
    tfidf = TfIdf()
    with open(dir_out + "list_alea_bigrams.pck", 'rb') as handle:
        ale_tweets = pickle.load(handle)
    with open(dir_out + "list_dept_bigrams_.pck", 'rb') as handle:
        parl_tweets = pickle.load(handle)
    parl_bgr_counter = [l.ngram_fd for l in parl_tweets]
    docs_bgr_counter = [l.ngram_fd for l in ale_tweets]
    bgr_counter = dict()
    for y in parl_bgr_counter:
        for k in y.keys():
            bgr_counter[k] = k in bgr_counter and bgr_counter[k] + y[k] or y[k]
    docs_bgr_counter.append(bgr_counter)
    tot_counter = dict()
def __init__(self):
    pio.renderers.default = 'browser'
    tfidf = TfIdf()
    self.ids, self.titles, self.matrix = tfidf.get_matrix()
    self.vectorizer = tfidf.get_vectorizer()
def menu():
    print("Que deseja fazer?")
    print("1 - Consultar a informação do site do jornal ABola")
    print("2 - Aplicar o algoritmo do TFIDF")
    print("3 - Sair")
    word = 0
    nword = 0
    narray = []
    j = 0
    for line in fileinput.input():
        if line.replace("\n", "") == "1":
            os.system('python3 web_scraper.py')
            print("Que deseja fazer?")
            print("1 - Consultar a informação do site do jornal ABola")
            print("2 - Aplicar o algoritmo do TFIDF")
            print("3 - Sair")
        elif (line.replace("\n", "") == "2") or (word > 0):
            if word == 0 and j == 0:
                if os.path.isdir("artigos") == False:
                    print('Necessita de gerar primeiro o conteúdo. Escolha a opção 1')
                    print("Que deseja fazer?")
                    print("1 - Consultar a informação do site do jornal ABola")
                    print("2 - Aplicar o algoritmo do TFIDF")
                    print("3 - Sair")
                else:
                    filesA = os.listdir('artigos')
                    table = TfIdf()
                    for i in filesA:
                        with open('artigos/{}'.format(i), 'r') as content:
                            #print(content.read().split('h2'))
                            val = content.read().split('h2')
                            firstVal = val[0]
                            secondVal = val[1]
                            table.add_document('title{}'.format(i),
                                               re.sub(r'[\W]', ' ', firstVal).lower().split())
                            table.add_document('text{}'.format(i),
                                               re.sub(r'[\W]', ' ', secondVal).lower().split())
                    word += 1
                    print('Indique quantas palavras quer comparar:')
            elif (word == 1) and (j == 0):
                if line.replace("\n", "").isnumeric() and int(line) > 1:
                    nword = int(line)
                    word += 1
                else:
                    print('Digite um número maior que 1')
            elif (word > 1) and (word <= nword) and (j == 0):
                if line.replace("\n", "") != '':
                    narray.append(line.replace("\n", "").lower())
                    word += 1
                else:
                    j = 1
            if j == 1:
                if line.replace("\n", "") != '':
                    narray.append(line.replace("\n", "").lower())
                    j += 1
            if j == 2:
                print(narray)
                fTDIDF = open('output' + narray[0] + '.html', 'w+')
                fTDIDF.write('<h2>Resultados da aplicação do algoritmo:</h2>')
                splitArray = {}
                for s in table.similarities(narray):
                    if s[0].startswith('title'):
                        s[0] = s[0].replace('title', '')
                        if s[0] in splitArray.keys():
                            splitArray[s[0]] += s[1] * 0.7
                        else:
                            splitArray[s[0]] = s[1] * 0.7
                    elif s[0].startswith('text'):
                        s[0] = s[0].replace('text', '')
                        if s[0] in splitArray.keys():
                            splitArray[s[0]] += s[1] * 0.3
                        else:
                            splitArray[s[0]] = s[1] * 0.3
                for elem in splitArray.keys():
                    fTDIDF.write('<p><h5><a href="artigos/{}" >'.format(elem) + elem
                                 + '</a> -> ' + str(splitArray[elem]) + '</h5></p>')
                new = 2  # open in a new tab, if possible
                url = "file:///home/ze/SPLN/WebScraper/output" + narray[0] + ".html"
                webbrowser.open(url, new=new)
                word = 0
                nword = 0
                narray = []
                print("Que deseja fazer?")
                print("1 - Consultar a informação do site do jornal ABola")
                print("2 - Aplicar o algoritmo do TFIDF")
                print("3 - Sair")
        elif (line.replace("\n", "") == "3") and (word == 0):
            print("Obrigado pela sua visita")
            fileinput.close()
tweets = list(itertools.chain.from_iterable(list(itertools.chain.from_iterable(parl_tweets))))
tot_counter = Counter(tweets)
docs_counter = list()
for alea_tw in alea_tweets:
    tw = list(itertools.chain.from_iterable(alea_tw))
    docs_counter.append(Counter(tw))
docs_counter.append(tot_counter)
parl_counters = list()
for parl in parl_tweets:
    tw = list(itertools.chain.from_iterable(parl))
    parl_counters.append(Counter(tw))
tfidf = TfIdf()
tfidf_like_bi_trigrams = list()
for word in tot_counter:
    tfidf_like_bi_trigrams.append(tfidf.tf(word, tot_counter)
                                  * tfidf.idf_like(word, tot_counter, tot_counter, docs_counter, parl_counters))
sort_tfidf_like = list(zip(tot_counter.keys(), tfidf_like_bi_trigrams))
sort_tfidf_like = sorted(sort_tfidf_like, key=lambda x: x[1], reverse=True)
with open(dir_out + "sort_tfidf_like_bi_trigram.pck", 'wb') as handle:
    pickle.dump(sort_tfidf_like, handle)