def class_report(user_id, model):
    R = Retrieve(user_id, model)
    topics = R.get_topics()
    all_similarity_results = []
    for topic in topics:
        vectors = R.get_vectorsUP(topic)
        mean_vector = R.calculateMean(vectors)
        similarity_results = R.computeSimilarity(mean_vector, 15)
        all_similarity_results.append(similarity_results)
    # Merge the per-topic results into a single dict, then order by similarity
    results_dict = {k: v for d in all_similarity_results for k, v in d.items()}
    results_dict_ordered = {k: v for k, v in sorted(results_dict.items(),
                                                    key=lambda item: item[1])}
    top_n = 20
    top_n_dict = dict(Counter(results_dict_ordered).most_common(top_n))
    keys = [k for k in top_n_dict]
    topics_retr = []
    for i in range(top_n):
        # `c` is a module-level sqlite3 cursor (see the setup sketch below)
        c.execute("SELECT topic FROM Tweets WHERE tweet_id=?", (keys[i],))
        topics_retr.append(c.fetchone()[0].decode('utf-8'))
    print(topics_retr)
    print('\n')
    print(topics)
    print('\n')
    print(top_n_dict)
    # Every retrieved tweet is predicted relevant (1); the ground truth is
    # whether its topic belongs to the user's topics
    binary_pred = [1 for _ in range(top_n)]
    binary_real = []
    for topic in topics_retr:
        if topic in topics:
            binary_real.append(1)
        else:
            binary_real.append(0)
    print(classification_report(binary_real, binary_pred))
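# Usage sketch (hypothetical, not from the original source): class_report
# relies on a module-level sqlite3 cursor named `c`, plus Counter and
# sklearn's classification_report. A minimal setup, assuming the same
# full_database.db used by RecommendList further down:
import sqlite3
from collections import Counter

from sklearn.metrics import classification_report

PATH = './'  # placeholder; the project defines its own PATH constant
conn = sqlite3.connect(PATH + 'full_database.db')
c = conn.cursor()

class_report('some_user_id', 'some_model')  # args: whatever Retrieve(user_id, model) expects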
class ResultStore:
    # Fragment: only the output step survived in this snippet. The method
    # signature below is reconstructed from the allResults.output() call in MAIN.
    def output(self):
        with open(self.outfile, 'w') as out:
            for (qid, docids) in self.results:
                for docid in docids:
                    print(qid, docid, file=out)


#==============================================================================
# MAIN

if __name__ == '__main__':
    config = CommandLine()
    print(config.__dict__)
    if config.exit:
        sys.exit(0)
    index = IndexLoader(config.indexFile).getIndex()
    retrieve = Retrieve(index, config.termWeighting)
    queries = Queries(config.queriesFile)
    allResults = ResultStore(config.outfile)
    t = MyTimer()
    t.start('retrieval')
    for qid in queries.qids():
        query = queries.getQuery(qid)
        results = retrieve.forQuery(query)
        allResults.store(qid, results)
    t.stopPrint('retrieval')
    allResults.output()
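# Sketch (hypothetical): the smallest retriever that fits the loop above --
# binary term matching, ranking documents by overlap with the query. The real
# index format and term weighting schemes are defined elsewhere in the project;
# here the index is assumed to map term -> {docid: count}.
class BinaryRetrieve:
    def __init__(self, index, termWeighting):
        self.index = index
        self.termWeighting = termWeighting  # unused in this binary sketch

    def forQuery(self, query):
        # count how many query terms each document contains
        scores = {}
        for term in query:
            for docid in self.index.get(term, {}):
                scores[docid] = scores.get(docid, 0) + 1
        # highest-overlap documents first, top 10
        return sorted(scores, key=scores.get, reverse=True)[:10]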
import argparse

# listen_for_connection, process_first_transmission, Retrieve, StoreMore and
# Store are project-local names assumed to be imported elsewhere in this file.

def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', dest='ip_address')
    parser.add_argument('-p', dest='port', default=9998)
    args = parser.parse_args()
    return int(args.port), args.ip_address.strip()


if __name__ == '__main__':
    port, ip = parse_arguments()
    connection = listen_for_connection(port, ip)
    flag = process_first_transmission(connection)
    if flag == 1:  # client side script is request.py
        print("retrieving files...")
        retrieve = Retrieve(connection)
        retrieve.process_request()
        print("complete")
    elif flag == 2:  # client side script is upload_more.py
        print("uploading additional files...")
        store_more = StoreMore(connection)
        store_more.send_data()
        store_more.store_id_to_name_db()
        store_more.store_word_to_id_db(1)
        store_more.store_enc_files()
    else:  # client side script is upload.py
        print("uploading files...")
        store = Store(connection)
        store.store_id_to_name_db()
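# Sketch (hypothetical): a listen_for_connection matching the call above,
# assuming a plain TCP server socket; the real implementation lives elsewhere.
import socket

def listen_for_connection(port, ip):
    server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    server.bind((ip, port))
    server.listen(1)                  # wait for a single client
    connection, addr = server.accept()
    return connection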
import datetime
from time import sleep

from retrieve import Retrieve

backup = Retrieve("password")
scheduler_state = backup.get_scheduler_state()

while scheduler_state != "off":
    # get the interval time to apply changes at each loop, if there is one
    interval = int(backup.get_interval())
    # refresh the scheduler state checked by the while condition
    scheduler_state = backup.get_scheduler_state()
    # if the backup state is on, back up all devices; otherwise do nothing
    if backup.get_backup_state() == "on":
        date_start = datetime.datetime.now()
        print("Backing up now at " + str(date_start))
        backup.backup_all_device()
        date_stop = datetime.datetime.now()
        print("End of the backup at " + str(date_stop))
    # wait for the configured interval before polling again
    # (implied by the interval lookup and the sleep import in the original)
    sleep(interval)
class App:
    # Initializer / instance attributes
    def __init__(self, taxonomy, protein):
        self.Taxonomy = taxonomy
        self.Protein = protein
        self.ncbi_api = Retrieve()
        self.tools = Tools()
        self.dataset = None
        self.fasta = None
        self.summary = None

    @classmethod
    def from_class(cls):
        return cls(User_input.from_input("taxonomy"),
                   User_input.from_input("protein"))

    @property
    def taxon_query(self):
        return self.Taxonomy.val

    @taxon_query.setter
    def taxon_query(self, inp):
        if inp != "taxonomy":
            self.Taxonomy = User_input.from_param(inp, "taxonomy")
        else:
            self.Taxonomy = User_input.from_input("taxonomy")

    @property
    def protein_query(self):
        return self.Protein.val

    @protein_query.setter
    def protein_query(self, inp):
        self.Protein = User_input.from_input("protein")

    def total_species(self):
        return len(self.dataset.keys())

    def total_seqs(self):
        return sum(1 for species in self.dataset for acc in self.dataset[species])

    def get_taxa(self):
        # given self.taxon_query, return the list of matching taxa
        return self.ncbi_api.get_taxa(self.taxon_query, "Taxonomy")

    def plot(self):
        self.tools.plot()

    def write(self, fasta, alt=""):
        if fasta:
            self.tools.write(fasta, self.protein_query, self.taxon_query)
        else:
            print("Missing fasta file! Please run get_fasta first.")

    def taxa(self, typ="all"):
        # gets list of all taxa produced from the search
        self.dataset = self.ncbi_api.taxa_protein_dict(self.get_fasta(), typ)

    def get_summary(self):
        self.summary = self.ncbi_api.summary(self.protein_query, self.taxon_query)

    # TODO: refactor to be a property
    def get_fasta(self):
        # initiates the NCBI search using esearch and efetch
        self.fasta = self.ncbi_api.retrieve(self.protein_query, self.taxon_query)
        return self.fasta
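# Usage sketch (hypothetical, not from the original source): a minimal
# interactive session with App, assuming User_input prompts on stdin and the
# NCBI wrapper is reachable.
app = App.from_class()            # prompts for taxonomy and protein queries
fasta = app.get_fasta()           # runs the esearch/efetch retrieval
app.taxa()                        # builds the species -> accessions dataset
print(app.total_species(), app.total_seqs())
app.write(fasta)                  # writes the FASTA via Tools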
    def form_pun(self, eval_path):
        retrieve = Retrieve(sentence_path=TEXT_DATA_DIR + TEXT_DATA,
                            pun_path=PUN_DATA_DIR + PUN_DATA)
        (pun, sentence, score) = retrieve.retrieve()
        if not sentence:
            print("No sentence with word {} was found. Exiting...".format(pun[1]))
            raise Exception()

        text = word_tokenize(sentence)
        tokenized = nltk.pos_tag(text)
        print(tokenized)
        print(sentence, pun[0], pun[1])

        pre = self.tokenizer.texts_to_sequences([sentence])
        wp = self.tokenizer.texts_to_sequences([pun[0]])
        wa = self.tokenizer.texts_to_sequences([pun[1]])
        if (not wa[0]) or (not wp[0]):
            print("The pair of pun and word does not exist in the parsed corpus. Exiting...")
            raise Exception()

        # Replace the alternative word in the sentence with the pun word
        index_wa = -1
        for seq in pre[0]:
            index_wa += 1
            if seq == wa[0][0]:
                pre[0][index_wa] = wp[0][0]
                break

        wordsimilarity = WordSimilarity()
        wordsimilarity.word2vec()
        wordsimilarity.load()

        # Scan the POS tags for a usable topic word, retrying on OOV words
        try_limit = 5
        try_count = 0
        index_topic = 0
        while True:
            try:
                topic_word = None
                for i in range(index_topic, len(tokenized)):
                    (word, pos) = tokenized[i]
                    if pos == 'NNP':
                        topic_word = "man"
                        print(word, pos)
                        index_topic += 1
                        break
                    if pos in ('NN', 'PRP', 'NNS', 'PRP$'):
                        topic_word = word
                        print(word, pos)
                        index_topic += 1
                        break
                    index_topic += 1
                result = wordsimilarity.getSimilar([topic_word, pun[0]], [pun[1]], 10)
                other_result = wordsimilarity.getSimilar([pun[0]], [], 10)
                break
            except KeyError:
                print("Word {} is not in vocabulary, trying with the next one".format(topic_word))
                try_count += 1
                if try_limit == try_count:
                    print("Limit of tries has been reached. Exiting...")
                    raise Exception()

        eval_surprisal = Evaluate()
        eval_surprisal.load_model(eval_path)

        finals = []
        mean_amalgam = 0
        for (word, prob) in result:
            swap = self.tokenizer.texts_to_sequences([word])
            context_window = 2
            surprise = eval_surprisal.compute_surpisal(sentence=pre[0],
                                                       pun_word=wa[0][0],
                                                       pun_alternative=wp[0][0],
                                                       context_window=context_window)
            mean_amalgam += surprise
            print(surprise)
            pre[0][index_topic] = swap[0][0]
            post_simple = self.tokenizer.sequences_to_texts([pre[0]])
            print(post_simple)
            pre[0][index_topic + 1] = 0
            if index_topic >= 2:
                pre[0][index_topic - 1] = 0
            post_smoothing = self.dac.inference(pre[0])
            post_smoothing = self.tokenizer.sequences_to_texts(post_smoothing.tolist())
            finals.append(post_smoothing)
            print(post_smoothing)
        print(finals)
        print(mean_amalgam / 10)

        other_finals = []
        mean_similar = 0
        for (word, prob) in other_result:
            swap = self.tokenizer.texts_to_sequences([word])
            context_window = 2
            surprise = eval_surprisal.compute_surpisal(sentence=pre[0],
                                                       pun_word=wa[0][0],
                                                       pun_alternative=wp[0][0],
                                                       context_window=context_window)
            mean_similar += surprise
            print(surprise)
            pre[0][index_topic] = swap[0][0]
            post_simple = self.tokenizer.sequences_to_texts([pre[0]])
            print(post_simple)
            pre[0][index_topic + 1] = 0
            if index_topic >= 2:
                pre[0][index_topic - 1] = 0
            post_smoothing = self.dac.inference(pre[0])
            post_smoothing = self.tokenizer.sequences_to_texts(post_smoothing.tolist())
            other_finals.append(post_smoothing)
            print(post_smoothing)
        print(other_finals)
        print(mean_similar / 10)

        # list.extend returns None, so extend first and then return the list
        # (the original returned finals.extend(other_finals), i.e. None)
        finals.extend(other_finals)
        return finals
class main:
    def __init__(self):
        self.BiasScrap = PolBiasScraper()
        self.ArticleScrap = ArticleScrapper()
        self.Retrie = Retrieve()
        self.dateTimeObj = datetime.now()
        self.timestamp = self.dateTimeObj.strftime("%Y-%m-%d-%H-%M")
        self.datasetSize = 20
        self.rankedsetSize = 10
        self.maxtags = 30
        self.politicweight = 0.95
        self.maxageseconds = 604800  # one week
        self.weight = {
            "age": 0.12,
            "tags": 0.24,
            "polarity": 0.21,
            "subjectivity": 0.13,
            "credibility": 0.13,
            "bias": 0.17
        }

    def percentage(self, article):
        unweighted = {
            "age": 0,
            "tags": 0,
            "polarity": 0,
            "subjectivity": 0,
            "credibility": 0,
            "bias": 0
        }
        # age: decays from 1 towards 0 as the article approaches one week old
        x = article["metricscores"]["age"] / self.maxageseconds
        unweighted["age"] = (x + 1) ** (-5 * x)
        # tags
        if article["metricscores"]["tags"] > 1:
            unweighted["tags"] = self.politicweight + (
                (article["metricscores"]["tags"] - 1) * (1 - self.politicweight))
        else:
            unweighted["tags"] = article["metricscores"]["tags"] * (1 - self.politicweight)
        # polarity
        unweighted["polarity"] = abs(article["metricscores"]["polarity"])
        # subjectivity (the original read the polarity score here, which
        # looks like a copy-paste slip)
        unweighted["subjectivity"] = article["metricscores"]["subjectivity"]
        # credibility
        unweighted["credibility"] = article["metricscores"]["credibility"] / 10
        # bias
        unweighted["bias"] = article["metricscores"]["bias"] / 5
        return unweighted

    def mediaBias(self, article):
        article["metrics"].update(self.ArticleScrap.parseURL(article["url"]))
        buildUrlFacts = ("https://mediabiasfactcheck.com/"
                         + article["media_name"].lower().replace(" ", "-"))
        article["metrics"].update(self.BiasScrap.parseURL(buildUrlFacts))

    def tagQuantification(self, article):
        metricscores = {}
        for x in article["metrics"]:
            if x == "tags":
                metricscores[x] = 0
                if "politics and government" in article["metrics"]["tags"]:
                    metricscores[x] += 1
                if len(article["metrics"]["tags"]) < 31:
                    metricscores[x] += len(article["metrics"]["tags"]) / 30
                else:
                    metricscores[x] += 1
            else:
                metricscores[x] = article["metrics"][x]
        return metricscores

    def retrieveData(self):
        data = self.Retrie.retrieve(self.timestamp, storylimit=self.datasetSize)
        data = data['articles'][:self.datasetSize]
        unrankeddata = {}
        for index, article in enumerate(data):
            print(index + 1)
            self.mediaBias(article)
            article["metricscores"] = self.tagQuantification(article)
            # combine the unweighted metric scores using the configured weights
            unweighted = self.percentage(article)
            score = 0
            for metric in unweighted:
                score += unweighted[metric] * self.weight[metric]
            article["score"] = abs(score)
            unrankeddata[abs(score)] = article
        rankeddata = {}
        for index, score_key in enumerate(sorted(unrankeddata.keys(), reverse=True)):
            rankeddata[index + 1] = unrankeddata[score_key]
            if index == self.rankedsetSize:
                break
        with open("json/data_{0}.json".format(self.timestamp), "w") as file:
            json.dump(rankeddata, file)
        return rankeddata
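# Usage sketch (hypothetical): running one scoring pass end to end, assuming
# the scrapers, Retrieve, and a json/ output directory are available.
if __name__ == '__main__':
    ranker = main()
    ranked = ranker.retrieveData()   # scrapes, scores, ranks, and dumps JSON
    for rank, article in ranked.items():
        print(rank, article["score"], article["url"])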
class RecommendList(QWidget):
    '''
    Recommendation window. The database connection is created in __init__,
    together with the Retrieve wrapper that supplies the recommendation list.
    '''

    def __init__(self, user, model):
        super().__init__()
        self.user = user
        self.MODEL = model
        self.update_list = []
        self.R = Retrieve(self.user, self.MODEL)
        self.conn = sqlite3.connect(PATH + 'full_database.db')
        self.c = self.conn.cursor()
        self.initUI()

    def initUI(self):
        self.resize(1450, 800)
        self.setLayout(QVBoxLayout())
        h_layout = QHBoxLayout()
        self.layout().addLayout(h_layout)
        # Load the table with tweets
        self.load_table()
        label = QLabel('Hey! Check out these Tweets!', self)
        h_layout.addWidget(label)
        button = QPushButton('RELOAD', self)
        button.setToolTip('This button will <b>Reload</b> the current window using the selected '
                          + 'items to improve the system.')
        button.clicked.connect(self.reload)
        self.layout().addWidget(button)
        back = QPushButton('Back', self)
        back.setToolTip('This button will bring you back to the user selection window.')
        back.clicked.connect(self.goBack)
        self.layout().addWidget(back)
        self.center()
        self.setWindowTitle('Tweets Recommendation List')
        self.show()

    def center(self):
        qr = self.frameGeometry()
        cp = QDesktopWidget().availableGeometry().center()
        qr.moveCenter(cp)
        self.move(qr.topLeft())

    def onStateChange_check(self, item):
        # the listener fires on every cell of the table;
        # we are only interested in checkbox state changes
        if item.checkState() == Qt.Checked:
            # first find the row position...
            row_id = item.row()
            # ...then self.keys gives us the corresponding tweet id
            tweet_id = self.keys[row_id]
            # store this id globally; it is pushed when RELOAD is pressed
            self.update_list.append(tweet_id)
            GLOBAL_BLACK_LIST.add(tweet_id)
        elif item.checkState() == Qt.Unchecked:
            row_id = item.row()
            tweet_id = self.keys[row_id]
            if tweet_id in self.update_list:
                self.update_list.remove(tweet_id)
                GLOBAL_BLACK_LIST.remove(tweet_id)

    def reload(self):
        alert = QMessageBox()
        alert.setText('This window will be reloaded!')
        alert.exec_()
        # Push the selected tweets to the user profile
        self.R.pushUP(self.update_list)
        # After the alert is dismissed, rebuild the window
        self.w = RecommendList(self.user, self.MODEL)
        self.w.show()
        self.hide()

    def goBack(self):
        self.w = LoginScreen()
        self.w.show()
        self.hide()

    def getVectors(self):
        '''Return the tweets, dates, and similarity scores to show in the table.'''
        n_topic = self.R.get_countTopic()
        topics = self.R.get_topics()
        all_similarity_results = []  # list of dicts, one per topic
        for topic in topics:
            # get vectors by topic in the user profile
            vectors = self.R.get_vectorsUP(topic)
            # get the mean
            mean_vector = self.R.calculateMean(vectors)
            # compute similarity
            similarity_results = self.R.computeSimilarity(mean_vector, 20)
            # remove the blacklisted ids
            similarity_results = {k: v for k, v in similarity_results.items()
                                  if k not in GLOBAL_BLACK_LIST}
            all_similarity_results.append(similarity_results)
        # Merge results into a single dict
        results_dict = {k: v for d in all_similarity_results for k, v in d.items()}
        # Order by similarity, descending
        results_dict_ordered = {k: v for k, v in sorted(results_dict.items(),
                                                        key=lambda item: item[1],
                                                        reverse=True)}
        # Obtain only the top N ids
        top_n = 19
        top_n_dict = dict(Counter(results_dict_ordered).most_common(top_n))
        # Now re-rank by also considering the date
        top_n_dict, top_n_list = self.R.ranking(top_n_dict)
        self.keys = [k for k in top_n_dict]
        # Obtain the 20th item from the long tail: a random id beyond the top N
        ran_value = randint(20, len(results_dict_ordered) - 1)
        count = 0
        for k, v in results_dict_ordered.items():
            if count == ran_value:
                ran_key = k
                ran_similarity = v
            count += 1
        # Retrieve the tweets and their dates
        tweets = []
        dates = []
        for i in range(top_n):
            self.c.execute("SELECT tweet FROM Tweets WHERE tweet_id=?", (self.keys[i],))
            tweets.append(self.c.fetchone())
            self.c.execute("SELECT date FROM Tweets WHERE tweet_id=?", (self.keys[i],))
            dates.append(self.c.fetchone()[0])
        # Get the last one from the tail
        self.c.execute("SELECT tweet FROM Tweets WHERE tweet_id=?", (ran_key,))
        tweets.append(self.c.fetchone())
        self.c.execute("SELECT date FROM Tweets WHERE tweet_id=?", (ran_key,))
        dates.append(self.c.fetchone()[0])
        self.keys.append(ran_key)
        return tweets, dates, top_n_list, ran_similarity

    def load_table(self):
        columns = 4
        rows, dates, top_n_list, tail_similarity = self.getVectors()
        rows = self.R.cleanRows(rows)
        n_rows = len(rows)
        # Start from an empty table; rows are inserted one by one below.
        # (The original pre-allocated n_rows rows and then inserted more,
        # which left empty rows at the bottom of the table.)
        self.table = QTableWidget(0, columns, self)
        self.table.setHorizontalHeaderLabels(["Tweet", "Do you like it?",
                                              "Tweet date", "Similarity"])
        header = self.table.horizontalHeader()
        header.setSectionResizeMode(0, QHeaderView.ResizeToContents)
        # Connect the checkbox listener once; the original connected it inside
        # the loop, making the handler fire once per row per click.
        self.table.itemClicked.connect(self.onStateChange_check)
        for inx, row in enumerate(rows):
            self.table.insertRow(inx)
            self.chkBoxItem = QTableWidgetItem('I Like it!')
            self.chkBoxItem.setFlags(Qt.ItemIsUserCheckable | Qt.ItemIsEnabled)
            self.chkBoxItem.setCheckState(Qt.Unchecked)
            self.chkBoxItem.setToolTip('Help to <b>Improve</b> our system selecting the items you like.')
            self.table.setItem(inx, 0, QTableWidgetItem(row))
            self.table.setItem(inx, 1, self.chkBoxItem)
            self.table.setItem(inx, 2, QTableWidgetItem(dates[inx]))
            if inx == (n_rows - 1):
                # the last row is the long-tail item
                self.table.setItem(inx, 3, QTableWidgetItem("{:1.5f}".format(tail_similarity)))
            else:
                self.table.setItem(inx, 3, QTableWidgetItem("{:1.5f}".format(top_n_list[inx])))
        self.layout().addWidget(self.table)

    def closeEvent(self, event):
        reply = QMessageBox.question(self, 'Message', "Are you sure to quit?",
                                     QMessageBox.Yes | QMessageBox.No, QMessageBox.No)
        if reply == QMessageBox.Yes:
            event.accept()
        else:
            event.ignore()
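# Launcher sketch (hypothetical, not from the original source): how the
# RecommendList window might be started, assuming PyQt5 and a prepared model.
import sys

from PyQt5.QtWidgets import QApplication

if __name__ == '__main__':
    qt_app = QApplication(sys.argv)
    model = None  # placeholder: pass whatever Retrieve(user, model) expects
    window = RecommendList('some_user', model)
    sys.exit(qt_app.exec_())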