Example #1
    def __init__(self, taxonomy, protein):
        self.Taxonomy = taxonomy
        self.Protein = protein
        self.ncbi_api = Retrieve()
        self.tools = Tools()
        self.dataset = None
        self.fasta = None
        self.summary = None
Example #2
	def __init__(self, user, model):
		super().__init__()
		self.user = user
		self.MODEL = model
		self.update_list = []
		self.R = Retrieve(self.user, self.MODEL)
		self.conn = sqlite3.connect(PATH+'full_database.db')
		self.c = self.conn.cursor()
		self.initUI()
Example #3
    def __init__(self):
        self.BiasScrap = PolBiasScraper()
        self.ArticleScrap = ArticleScrapper()
        self.Retrie = Retrieve()
        self.dateTimeObj = datetime.now()
        self.timestamp = self.dateTimeObj.strftime("%Y-%m-%d-%H-%M")
        self.datasetSize = 20
        self.rankedsetSize = 10
        self.maxtags = 30
        self.politicweight = 0.95
        self.maxageseconds = 604800

        self.weight = {
            "age": 0.12,
            "tags": 0.24,
            "polarity": 0.21,
            "subjectivity": 0.13,
            "credibility": 0.13,
            "bias": 0.17
        }
Example #4
from collections import Counter
import sqlite3

from sklearn.metrics import classification_report


def class_report(user_id, model):
	R = Retrieve(user_id, model)
	# cursor into the shared tweet database (PATH and schema as in Example #11)
	conn = sqlite3.connect(PATH + 'full_database.db')
	c = conn.cursor()
	topics = R.get_topics()

	all_similarity_results = []

	for topic in topics:
		vectors = R.get_vectorsUP(topic)
		mean_vector = R.calculateMean(vectors)
		similarity_results = R.computeSimilarity(mean_vector,15)
		all_similarity_results.append(similarity_results)

	results_dict = {k: v for d in all_similarity_results for k, v in d.items()}

	results_dict_ordered = {k: v for k, v in sorted(results_dict.items(), key=lambda item: item[1], reverse=True)}
	top_n = 20
	top_n_dict = dict(Counter(results_dict_ordered).most_common(top_n))
	keys = [k for k, v in top_n_dict.items()]

	topics_retr = []
	for key in keys:
		c.execute("SELECT topic FROM Tweets WHERE tweet_id=? ", (key,))
		topics_retr.append(c.fetchone()[0].decode('utf-8'))

	print(topics_retr)
	print('\n')
	print(topics)
	print('\n')
	print(top_n_dict)
	binary_pred = [1] * len(topics_retr)
	binary_real = []
	for topic in topics_retr:
		if topic in topics:
			binary_real.append(1)
		else:
			binary_real.append(0)

	print(classification_report(binary_real, binary_pred))
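
Because binary_pred is all ones, the class-1 row of the report reduces to precision@N: the fraction of retrieved tweets whose topic appears in the user's profile. A tiny self-contained illustration with hypothetical labels (not data from the source):

from sklearn.metrics import classification_report

# hypothetical: 3 of the 4 retrieved tweets match a profile topic
print(classification_report([1, 1, 0, 1], [1, 1, 1, 1]))  # class-1 precision = 0.75, recall = 1.00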
Example #5
    def output(self):
        with open(self.outfile, 'w') as out:
            for (qid, docids) in self.results:
                for docid in docids:
                    print(qid, docid, file=out)

#==============================================================================
# MAIN

import sys

if __name__ == '__main__':

    config = CommandLine()
    print(config.__dict__)
    if config.exit:
        sys.exit(0)        
    index = IndexLoader(config.indexFile).getIndex()
    retrieve = Retrieve(index, config.termWeighting)
    queries = Queries(config.queriesFile)
    allResults = ResultStore(config.outfile)

    t = MyTimer()
    t.start('retrieval')

    for qid in queries.qids():
        query = queries.getQuery(qid)
        results = retrieve.forQuery(query)
        allResults.store(qid, results)

    t.stopPrint('retrieval')    
    allResults.output()

Example #6

import argparse


def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', dest='ip_address', required=True)
    parser.add_argument('-p', dest='port', default=9998, type=int)
    args = parser.parse_args()
    return args.port, args.ip_address.strip()


if __name__ == '__main__':
    port, ip = parse_arguments()
    connection = listen_for_connection(port, ip)
    flag = process_first_transmission(connection)
    if flag == 1:
        # client side script is request.py
        print("retrieving files...")
        retrieve = Retrieve(connection)
        retrieve.process_request()
        print("complete")
    elif flag == 2:
        # client side script is upload_more.py
        print("uploading additional files...")
        store_more = StoreMore(connection)
        store_more.send_data()
        store_more.store_id_to_name_db()
        store_more.store_word_to_id_db(1)
        store_more.store_enc_files()
    else:
        # client side script is upload.py
        print("uploading files...")
        store = Store(connection)
        store.store_id_to_name_db()
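
The first transmission evidently carries a mode flag that selects the server branch: 1 for request.py (download), 2 for upload_more.py, anything else for upload.py. A minimal client-side sketch of that handshake, assuming a plain TCP socket and a single-byte flag; the exact wire format is an assumption, not taken from the source:

import socket

# hypothetical handshake: announce the mode before sending any file data
with socket.create_connection(('127.0.0.1', 9998)) as sock:  # port matches the -p default above
    sock.sendall(b'1')  # 1 = retrieve files; 2 = upload more; anything else = initial upload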
Example #7
import datetime
from retrieve import Retrieve
from time import sleep


backup = Retrieve("password")


scheduler_state = backup.get_scheduler_state()


while scheduler_state != "off":

    # get the interval time to apply changes at each loop, if there's one.
    interval = int(backup.get_interval())

    # get the scheduler_state in state variable, to be check by the while loop
    scheduler_state = backup.get_scheduler_state()

    # if the backup_state is on, proceed to the backup of all devices, else do nothing
    if backup.get_backup_state() == "on":

        date_start = datetime.datetime.now()
        print("Backuping now at " + str(date_start))
        backup.backup_all_device()

        date_stop = datetime.datetime.now()
        print("End of the backup at " + str(date_stop))

    # wait for the configured interval before polling the scheduler again
    sleep(interval)
Example #8
class App:
    #Initializer / instance attributes
    def __init__(self, taxonomy, protein):
        self.Taxonomy = taxonomy
        self.Protein = protein
        self.ncbi_api = Retrieve()
        self.tools = Tools()
        self.dataset = None
        self.fasta = None
        self.summary = None

    @classmethod
    def from_class(cls):
        return cls(User_input.from_input("taxonomy"),
                   User_input.from_input("protein"))

    @property
    def taxon_query(self):
        return self.Taxonomy.val

    @taxon_query.setter
    def taxon_query(self, inp):
        if inp != "taxonomy":
            self.Taxonomy = User_input.from_param(inp, "taxonomy")
        else:
            self.Taxonomy = User_input.from_input("taxonomy")

    @property
    def protein_query(self):
        return self.Protein.val

    @protein_query.setter
    def protein_query(self, inp):
        if inp != "protein":
            self.Protein = User_input.from_param(inp, "protein")
        else:
            self.Protein = User_input.from_input("protein")

    def total_species(self):
        return len(self.dataset)

    def total_seqs(self):
        return sum(1 for species in self.dataset
                   for acc in self.dataset[species])

    def get_taxa(self):
        # given self.taxon_query, return the list of matching taxa from NCBI
        return self.ncbi_api.get_taxa(self.taxon_query, "Taxonomy")

    def plot(self):
        self.tools.plot()

    def write(self, fasta, alt=""):
        if fasta:
            self.tools.write(fasta, self.protein_query, self.taxon_query)
        else:
            print("Missing fasta file! Please run get_fasta first.")

    def taxa(self, typ="all"):
        # gets list of all taxa produced from the search
        self.dataset = self.ncbi_api.taxa_protein_dict(self.get_fasta(), typ)

    def get_summary(self):
        self.summary = self.ncbi_api.summary(self.protein_query,
                                             self.taxon_query)

    #TODO refactor to be property
    def get_fasta(self):
        #initiates ncbi search using esearch and efetch
        self.fasta = self.ncbi_api.retrieve(self.protein_query,
                                            self.taxon_query)
        return self.fasta
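
A minimal usage sketch for App, assuming the Retrieve, Tools, and User_input helpers behave as the method bodies above suggest; the call order (fetch the FASTA, then build the taxa dataset) follows get_fasta and taxa:

app = App.from_class()     # prompts for taxonomy and protein via User_input
fasta = app.get_fasta()    # esearch/efetch through the Retrieve wrapper
app.taxa()                 # builds the species -> accessions dataset
print(app.total_species(), app.total_seqs())
app.write(fasta)           # writes the FASTA via Tools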
Example #9
    def form_pun(self, eval_path):
        retrieve = Retrieve(sentence_path=TEXT_DATA_DIR + TEXT_DATA,
                            pun_path=PUN_DATA_DIR + PUN_DATA)
        (pun, sentence, score) = retrieve.retrieve()

        if not sentence:
            print("No sentence with word {} was found. Exiting...".format(
                pun[1]))
            raise Exception()

        text = word_tokenize(sentence)
        tokenized = nltk.pos_tag(text)

        print(tokenized)
        print(sentence, pun[0], pun[1])
        pre = self.tokenizer.texts_to_sequences([sentence])
        wp = self.tokenizer.texts_to_sequences([pun[0]])
        wa = self.tokenizer.texts_to_sequences([pun[1]])

        if (not wa[0]) or (not wp[0]):
            print(
                "The pair of pun and word does not exist in the parsed corpus. Exit..."
            )
            raise Exception()

        index_wa = -1
        for seq in pre[0]:
            index_wa = index_wa + 1
            if seq == wa[0][0]:
                pre[0][index_wa] = wp[0][0]
                break

        wordsimilarity = WordSimilarity()
        wordsimilarity.word2vec()
        wordsimilarity.load()

        try_limit = 5
        try_count = 0
        index_topic = 0
        while True:
            try:
                topic_word = None
                for i in range(index_topic, len(tokenized)):
                    (word, pos) = tokenized[i]
                    if pos == 'NNP':
                        topic_word = "man"
                        print(word, pos)
                        index_topic = index_topic + 1
                        break

                    if pos in ('NN', 'PRP', 'NNS', 'PRP$'):
                        topic_word = word
                        print(word, pos)
                        index_topic = index_topic + 1
                        break
                    index_topic = index_topic + 1

                result = wordsimilarity.getSimilar([topic_word, pun[0]],
                                                   [pun[1]], 10)
                other_result = wordsimilarity.getSimilar([pun[0]], [], 10)

                break
            except KeyError:
                print("Word {} is not in vocabulary, try with the next one".
                      format(topic_word))
                try_count = try_count + 1
                if try_limit == try_count:
                    print("Limit of trys has been reached. Exit...")
                    raise Exception()

        eval_surprisal = Evaluate()
        eval_surprisal.load_model(eval_path)

        finals = []
        mean_amalgam = 0
        for (word, prob) in result:
            swap = self.tokenizer.texts_to_sequences([word])

            context_window = 2
            surprise = eval_surprisal.compute_surpisal(
                sentence=pre[0],
                pun_word=wa[0][0],
                pun_alternative=wp[0][0],
                context_window=context_window)
            mean_amalgam = mean_amalgam + surprise
            print(surprise)

            pre[0][index_topic] = swap[0][0]

            post_simple = self.tokenizer.sequences_to_texts([pre[0]])
            print(post_simple)

            pre[0][index_topic + 1] = 0
            if index_topic >= 2:
                pre[0][index_topic - 1] = 0

            post_smoothing = self.dac.inference(pre[0])
            post_smoothing = self.tokenizer.sequences_to_texts(
                post_smoothing.tolist())
            finals.append(post_smoothing)
            print(post_smoothing)
        print(finals)
        print(mean_amalgam / len(result))

        other_finals = []
        mean_similar = 0
        for (word, prob) in other_result:
            swap = self.tokenizer.texts_to_sequences([word])

            context_window = 2
            surprise = eval_surprisal.compute_surpisal(
                sentence=pre[0],
                pun_word=wa[0][0],
                pun_alternative=wp[0][0],
                context_window=context_window)
            mean_similar = mean_similar + surprise
            print(surprise)

            pre[0][index_topic] = swap[0][0]

            post_simple = self.tokenizer.sequences_to_texts([pre[0]])
            print(post_simple)

            pre[0][index_topic + 1] = 0
            if index_topic >= 2:
                pre[0][index_topic - 1] = 0

            post_smoothing = self.dac.inference(pre[0])
            post_smoothing = self.tokenizer.sequences_to_texts(
                post_smoothing.tolist())
            other_finals.append(post_smoothing)
            print(post_smoothing)
        print(other_finals)
        print(mean_similar / len(other_result))

        finals.extend(other_finals)
        return finals
Example #10
class main():
    def __init__(self):
        self.BiasScrap = PolBiasScraper()
        self.ArticleScrap = ArticleScrapper()
        self.Retrie = Retrieve()
        self.dateTimeObj = datetime.now()
        self.timestamp = self.dateTimeObj.strftime("%Y-%m-%d-%H-%M")
        self.datasetSize = 20
        self.rankedsetSize = 10
        self.maxtags = 30
        self.politicweight = 0.95
        self.maxageseconds = 604800

        self.weight = {
            "age": 0.12,
            "tags": 0.24,
            "polarity": 0.21,
            "subjectivity": 0.13,
            "credibility": 0.13,
            "bias": 0.17
        }

    def percentage(self, article):
        unweighted = {
            "age": 0,
            "tags": 0,
            "polarity": 0,
            "subjectivity": 0,
            "credibility": 0,
            "bias": 0
        }

        # age
        x = (article["metricscores"]["age"] / self.maxageseconds)
        unweighted["age"] = (x + 1)**(-5 * x)

        # tags
        if article["metricscores"]["tags"] > 1:
            unweighted["tags"] = self.politicweight + (
                (article["metricscores"]["tags"] - 1) *
                (1 - self.politicweight))
        else:
            unweighted["tags"] = (article["metricscores"]["tags"]) * (
                1 - self.politicweight)

        # polarity
        x = abs(article["metricscores"]["polarity"])
        unweighted["polarity"] = x

        # subjectivity
        unweighted["subjectivity"] = article["metricscores"]["polarity"]

        # credibility
        unweighted["credibility"] = article["metricscores"]["credibility"] / 10

        # bias
        unweighted["bias"] = article["metricscores"]["bias"] / 5

        return unweighted

    def mediaBias(self, article):
        article["metrics"].update(self.ArticleScrap.parseURL(article["url"]))
        buildUrlFacts = "https://mediabiasfactcheck.com/" + article[
            "media_name"].lower().replace(" ", "-")
        article["metrics"].update(self.BiasScrap.parseURL(buildUrlFacts))

    def tagQuantification(self, article):
        metricscores = {}
        for x in article["metrics"]:
            if x == "tags":
                metricscores[x] = 0
                if "politics and government" in article["metrics"]["tags"]:
                    metricscores[x] += 1
                if len(article["metrics"]["tags"]) < 31:
                    metricscores[x] += len(article["metrics"]["tags"]) / 30
                else:
                    metricscores[x] += 1
            else:
                metricscores[x] = article["metrics"][x]
        return metricscores

    def retrieveData(self):
        data = self.Retrie.retrieve(self.timestamp,
                                    storylimit=self.datasetSize)
        data = data['articles'][:self.datasetSize]

        unrankeddata = {}

        for index, article in enumerate(data):
            print(index + 1)

            self.mediaBias(article)

            article["metricscores"] = self.tagQuantification(article)

            # weights
            unweighted = self.percentage(article)

            score = 0
            for metric in unweighted:
                score += unweighted[metric] * self.weight[metric]

            article["score"] = abs(score)
            unrankeddata[abs(score)] = article

        rankeddata = {}

        for index, article in enumerate(
                sorted(unrankeddata.keys(), reverse=True)):
            rankeddata[index + 1] = unrankeddata[article]
            if index + 1 == self.rankedsetSize:
                break
        with open("json/data_{0}.json".format(self.timestamp), "w") as file:
            json.dump(rankeddata, file)

        return rankeddata
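
The final score is a convex combination of the six metric percentages (the weights sum to 1.0), so it always lands in [0, 1]. A quick check of the arithmetic with hypothetical metric values:

weights = {"age": 0.12, "tags": 0.24, "polarity": 0.21,
           "subjectivity": 0.13, "credibility": 0.13, "bias": 0.17}
unweighted = {"age": 0.5, "tags": 1.0, "polarity": 0.3,
              "subjectivity": 0.2, "credibility": 0.8, "bias": 0.4}  # hypothetical values
score = sum(unweighted[m] * weights[m] for m in weights)
print(round(score, 3))  # 0.561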
Example #11
class RecommendList(QWidget):
	def __init__(self, user, model):
		super().__init__()
		self.user = user
		self.MODEL = model
		self.update_list = []
		self.R = Retrieve(self.user, self.MODEL)
		self.conn = sqlite3.connect(PATH+'full_database.db')
		self.c = self.conn.cursor()
		self.initUI()

	'''
	A database connection has to be defined here.
	We also have to import our preprocess class and the class that obtains the items' list.
	'''
	def initUI(self):
		self.resize(1450, 800)
		self.setLayout(QVBoxLayout())
		h_layout = QHBoxLayout()
		self.layout().addLayout(h_layout)
		
		# Call the function to load the table with tweets
		self.load_table()
		
		label = QLabel('Hey! Check out these Tweets!', self)
		h_layout.addWidget(label)

		button = QPushButton('RELOAD', self)
		button.setToolTip('This button will <b>Reload</b> the current window using the selected '+
						'items to improve the system.')
		
		button.clicked.connect(self.reload)
		self.layout().addWidget(button)
		
		back = QPushButton('Back', self)
		back.setToolTip('This button will bring you back to the user selection window.')
		back.clicked.connect(self.goBack)
		self.layout().addWidget(back)
		
		self.center()
		
		self.setWindowTitle('Tweets Recommendation List')
		self.show()
	
	def center(self):
		qr = self.frameGeometry()
		cp = QDesktopWidget().availableGeometry().center()
		qr.moveCenter(cp)
		self.move(qr.topLeft())
		
	def onStateChange_check(self, item):
		# the listener fires on every cell of the table;
		# we only care about checkbox state changes
		if item.checkState() == Qt.Checked:
			# first find the row position
			row_id = item.row()
			# then self.keys maps the row to its tweet id
			tweet_id = self.keys[row_id]
			# store this id globally; applied when the reload button is pressed
			self.update_list.append(tweet_id)
			GLOBAL_BLACK_LIST.add(tweet_id)
			
		elif item.checkState() == Qt.Unchecked:
			row_id = item.row()
			tweet_id = self.keys[row_id]
			if tweet_id in self.update_list:
				self.update_list.remove(tweet_id)
				GLOBAL_BLACK_LIST.remove(tweet_id)
		
	def reload(self):
		alert = QMessageBox()
		alert.setText('This window will be reloaded!')
		alert.exec_()
		# Call the update user profile function
		self.R.pushUP(self.update_list)
		# After pressing the ok button in the alert, the window will be updated
		self.w = RecommendList(self.user, self.MODEL)
		self.w.show()
		self.hide()
	
	def goBack(self):
		self.w = LoginScreen()
		self.w.show()
		self.hide()
	
	'''
	This function returns the list of tweets to show in the table.
	'''
	def getVectors(self):
		n_topic = self.R.get_countTopic()
		topics = self.R.get_topics()
		
		all_similarity_results = [] # list of dicts of N = n_topics elements
		for topic in topics:
			# get vectors by topic in the user profile
			vectors = self.R.get_vectorsUP(topic)
			#print(vectors)
			# get the mean
			mean_vector = self.R.calculateMean(vectors)
			# compute similarity
			similarity_results = self.R.computeSimilarity(mean_vector,20)
			# remove the black listed ids
			similarity_results = {k: v for k, v in similarity_results.items() if k not in GLOBAL_BLACK_LIST}
			# append to the general list
			all_similarity_results.append(similarity_results)
		# Merge results into a single dict
		results_dict = {k: v for d in all_similarity_results for k, v in d.items()}
		# Order by value
		results_dict_ordered = {k: v for k, v in sorted(results_dict.items(), key=lambda item: item[1], reverse=True)}
		# Obtain only top N ids
		top_n = 19
		top_n_dict = dict(Counter(results_dict_ordered).most_common(top_n))
		# Now order by considering the date
		top_n_dict, top_n_list = self.R.ranking(top_n_dict)
		self.keys = [k for k, v in top_n_dict.items()]
		# Obtain one extra item from the long tail, beyond the top N
		ran_value = randint(20, len(results_dict_ordered) - 1)
		ran_key, ran_similarity = list(results_dict_ordered.items())[ran_value]
		# Retrieve Tweets
		tweets = []
		dates = []
		for key in self.keys:
			self.c.execute("SELECT tweet FROM Tweets WHERE tweet_id=? ", (key,))
			tweets.append(self.c.fetchone())
			self.c.execute("SELECT date FROM Tweets WHERE tweet_id=? ", (key,))
			dates.append(self.c.fetchone()[0])
		
		# Get the last one from the tail
		self.c.execute("SELECT tweet FROM Tweets WHERE tweet_id=? ",(ran_key,))
		tweets.append(self.c.fetchone())
		self.c.execute("SELECT date FROM Tweets WHERE tweet_id=? ",(ran_key,))
		dates.append(self.c.fetchone()[0])
		self.keys.append(ran_key)
		return tweets, dates, top_n_list, ran_similarity
		
		
	def load_table(self):
		columns = 4
		
		rows, dates, top_n_list, tail_similarity = self.getVectors()
		rows = self.R.cleanRows(rows)
		n_rows = len(rows)
		
		self.table = QTableWidget(n_rows, columns, self)
		self.table.setHorizontalHeaderLabels(["Tweet", "Do you like it?", "Tweet date", "Similarity"])
		# connect the checkbox listener once, rather than once per row
		self.table.itemClicked.connect(self.onStateChange_check)
		header = self.table.horizontalHeader()
		header.setSectionResizeMode(0, QHeaderView.ResizeToContents)
		#self.table.setResizeMode(QHeaderView.ResizeToContents)
		
		for inx, row in enumerate(rows):

			self.table.insertRow(inx)
			self.chkBoxItem = QTableWidgetItem('I Like it!')
			self.chkBoxItem.setFlags(Qt.ItemIsUserCheckable | Qt.ItemIsEnabled)
			self.chkBoxItem.setCheckState(Qt.Unchecked)
			self.chkBoxItem.setToolTip('Help to <b>Improve</b> our system selecting the items you like.')
			
			self.table.setItem(inx, 0, QTableWidgetItem(row))
			self.table.setItem(inx, 1, self.chkBoxItem)
			self.table.setItem(inx, 2, QTableWidgetItem(dates[inx]))
			if inx == (n_rows - 1):
				self.table.setItem(inx, 3, QTableWidgetItem("{:1.5f}".format(tail_similarity)))
			else:
				self.table.setItem(inx, 3, QTableWidgetItem("{:1.5f}".format(top_n_list[inx])))
		self.layout().addWidget(self.table)
			
	def closeEvent(self, event):
		reply = QMessageBox.question(self, 'Message',
			"Are you sure you want to quit?",
			QMessageBox.Yes | QMessageBox.No, QMessageBox.No)
		if reply == QMessageBox.Yes:
			event.accept()
		else:
			event.ignore()
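
getVectors and class_report (Example #4) share the same merge-then-rank pattern: the per-topic similarity dicts are merged into one, ordered by score, and cut to the top N via Counter.most_common. A stripped-down sketch of that pattern with hypothetical scores:

from collections import Counter

per_topic = [{"t1": 0.9, "t2": 0.4}, {"t3": 0.7}]  # hypothetical similarity dicts
merged = {k: v for d in per_topic for k, v in d.items()}
top_n = dict(Counter(merged).most_common(2))
print(top_n)  # {'t1': 0.9, 't3': 0.7}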