# Module-level imports required by this method's body.
import nltk
from nltk.tokenize import word_tokenize


def form_pun(self, eval_path):
    retrieve = Retrieve(sentence_path=TEXT_DATA_DIR + TEXT_DATA,
                        pun_path=PUN_DATA_DIR + PUN_DATA)
    (pun, sentence, score) = retrieve.retrieve()
    if not sentence:
        print("No sentence with word {} was found. Exiting...".format(pun[1]))
        raise Exception()

    # Part-of-speech tag the retrieved sentence.
    text = word_tokenize(sentence)
    tokenized = nltk.pos_tag(text)
    print(tokenized)
    print(sentence, pun[0], pun[1])

    # Map the sentence and both pun words to token-id sequences.
    pre = self.tokenizer.texts_to_sequences([sentence])
    wp = self.tokenizer.texts_to_sequences([pun[0]])
    wa = self.tokenizer.texts_to_sequences([pun[1]])
    if (not wa[0]) or (not wp[0]):
        print("The pun/alternative-word pair does not exist in the parsed "
              "corpus. Exiting...")
        raise Exception()

    # Replace the first occurrence of the alternative word with the pun word.
    for index_wa, seq in enumerate(pre[0]):
        if seq == wa[0][0]:
            pre[0][index_wa] = wp[0][0]
            break

    wordsimilarity = WordSimilarity()
    wordsimilarity.word2vec()
    wordsimilarity.load()

    # Pick a topic word from the tagged sentence; on a KeyError (word not in
    # the word2vec vocabulary) retry with the next candidate, up to try_limit.
    try_limit = 5
    try_count = 0
    index_topic = 0
    while True:
        try:
            topic_word = None
            for i in range(index_topic, len(tokenized)):
                (word, pos) = tokenized[i]
                if pos == 'NNP':
                    # Proper nouns are replaced by a generic placeholder.
                    topic_word = "man"
                    print(word, pos)
                    index_topic += 1
                    break
                if pos in ('NN', 'PRP', 'NNS', 'PRP$'):
                    topic_word = word
                    print(word, pos)
                    index_topic += 1
                    break
                index_topic += 1
            result = wordsimilarity.getSimilar([topic_word, pun[0]],
                                               [pun[1]], 10)
            other_result = wordsimilarity.getSimilar([pun[0]], [], 10)
            break
        except KeyError:
            print("Word {} is not in vocabulary, trying the next one".format(
                topic_word))
            try_count += 1
            if try_limit == try_count:
                print("Limit of tries has been reached. Exiting...")
                raise Exception()

    eval_surprisal = Evaluate()
    eval_surprisal.load_model(eval_path)

    finals = []
    mean_amalgam = 0
    for (word, prob) in result:
        swap = self.tokenizer.texts_to_sequences([word])
        context_window = 2
        surprise = eval_surprisal.compute_surpisal(
            sentence=pre[0],
            pun_word=wa[0][0],
            pun_alternative=wp[0][0],
            context_window=context_window)
        mean_amalgam += surprise
        print(surprise)
        # Swap in the candidate topic word and blank its neighbours
        # before smoothing with the DAC model.
        pre[0][index_topic] = swap[0][0]
        post_simple = self.tokenizer.sequences_to_texts([pre[0]])
        print(post_simple)
        pre[0][index_topic + 1] = 0
        if index_topic >= 2:
            pre[0][index_topic - 1] = 0
        post_smoothing = self.dac.inference(pre[0])
        post_smoothing = self.tokenizer.sequences_to_texts(
            post_smoothing.tolist())
        finals.append(post_smoothing)
        print(post_smoothing)
    print(finals)
    print(mean_amalgam / 10)  # average over the 10 requested candidates

    other_finals = []
    mean_similar = 0
    for (word, prob) in other_result:
        swap = self.tokenizer.texts_to_sequences([word])
        context_window = 2
        surprise = eval_surprisal.compute_surpisal(
            sentence=pre[0],
            pun_word=wa[0][0],
            pun_alternative=wp[0][0],
            context_window=context_window)
        mean_similar += surprise
        print(surprise)
        pre[0][index_topic] = swap[0][0]
        post_simple = self.tokenizer.sequences_to_texts([pre[0]])
        print(post_simple)
        pre[0][index_topic + 1] = 0
        if index_topic >= 2:
            pre[0][index_topic - 1] = 0
        post_smoothing = self.dac.inference(pre[0])
        post_smoothing = self.tokenizer.sequences_to_texts(
            post_smoothing.tolist())
        other_finals.append(post_smoothing)
        print(post_smoothing)
    print(other_finals)
    print(mean_similar / 10)  # average over the 10 requested candidates

    # list.extend() returns None, so extend first and then return the list.
    finals.extend(other_finals)
    return finals
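# A minimal, self-contained sketch of the topic-word selection rule used in
# form_pun above: POS-tag a sentence and take the first noun or pronoun,
# substituting the generic placeholder "man" for proper nouns. The example
# sentence and function name are illustrative only; requires the NLTK
# "punkt" and "averaged_perceptron_tagger" data packages (nltk.download).
import nltk
from nltk.tokenize import word_tokenize


def pick_topic_word(sentence):
    tagged = nltk.pos_tag(word_tokenize(sentence))
    for word, pos in tagged:
        if pos == 'NNP':
            return "man"  # placeholder for proper nouns
        if pos in ('NN', 'PRP', 'NNS', 'PRP$'):
            return word
    return None


print(pick_topic_word("The chef seasoned the soup with thyme."))  # -> "chef"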
class App:
    # Initializer / instance attributes
    def __init__(self, taxonomy, protein):
        self.Taxonomy = taxonomy
        self.Protein = protein
        self.ncbi_api = Retrieve()
        self.tools = Tools()
        self.dataset = None
        self.fasta = None
        self.summary = None

    @classmethod
    def from_class(cls):
        return cls(User_input.from_input("taxonomy"),
                   User_input.from_input("protein"))

    @property
    def taxon_query(self):
        return self.Taxonomy.val

    @taxon_query.setter
    def taxon_query(self, inp):
        if inp != "taxonomy":
            self.Taxonomy = User_input.from_param(inp, "taxonomy")
        else:
            self.Taxonomy = User_input.from_input("taxonomy")

    @property
    def protein_query(self):
        return self.Protein.val

    @protein_query.setter
    def protein_query(self, inp):
        # Mirror the taxon_query setter so a supplied value is used
        # instead of always prompting for input.
        if inp != "protein":
            self.Protein = User_input.from_param(inp, "protein")
        else:
            self.Protein = User_input.from_input("protein")

    def total_species(self):
        return len(self.dataset.keys())

    def total_seqs(self):
        return sum(1 for species in self.dataset
                   for acc in self.dataset[species])

    def get_taxa(self):
        # Given self.taxon_query, return the list of matching taxa.
        return self.ncbi_api.get_taxa(self.taxon_query, "Taxonomy")

    def plot(self):
        self.tools.plot()

    def write(self, fasta, alt=""):
        if fasta:
            self.tools.write(fasta, self.protein_query, self.taxon_query)
        else:
            print("Missing fasta file! Please run get_fasta first.")

    def taxa(self, typ="all"):
        # Builds the species -> accessions dataset from the search results.
        self.dataset = self.ncbi_api.taxa_protein_dict(self.get_fasta(), typ)

    def get_summary(self):
        self.summary = self.ncbi_api.summary(self.protein_query,
                                             self.taxon_query)

    # TODO: refactor to be a property
    def get_fasta(self):
        # Initiates the NCBI search using esearch and efetch.
        self.fasta = self.ncbi_api.retrieve(self.protein_query,
                                            self.taxon_query)
        return self.fasta
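# A usage sketch, assuming the project-internal Retrieve, Tools, and
# User_input modules are importable. The call sequence follows the methods
# defined above; the printed summary line is illustrative only.
if __name__ == "__main__":
    app = App.from_class()  # prompts for taxonomy and protein queries
    app.taxa(typ="all")     # runs the NCBI search and builds the dataset
    print(app.total_species(), "species,", app.total_seqs(), "sequences")
    app.write(app.fasta)    # writes the fetched FASTA to disk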
# Module-level imports required by this class.
import json
from datetime import datetime


class main():
    def __init__(self):
        self.BiasScrap = PolBiasScraper()
        self.ArticleScrap = ArticleScrapper()
        self.Retrie = Retrieve()
        self.dateTimeObj = datetime.now()
        self.timestamp = self.dateTimeObj.strftime("%Y-%m-%d-%H-%M")
        self.datasetSize = 20
        self.rankedsetSize = 10
        self.maxtags = 30
        self.politicweight = 0.95
        self.maxageseconds = 604800  # one week in seconds
        # Relative weights of each metric; they sum to 1.0.
        self.weight = {
            "age": 0.12,
            "tags": 0.24,
            "polarity": 0.21,
            "subjectivity": 0.13,
            "credibility": 0.13,
            "bias": 0.17
        }

    def percentage(self, article):
        # Normalise every raw metric score to the [0, 1] range.
        unweighted = {
            "age": 0,
            "tags": 0,
            "polarity": 0,
            "subjectivity": 0,
            "credibility": 0,
            "bias": 0
        }
        # age: decays from 1.0 towards 0 as the article approaches
        # maxageseconds old
        x = article["metricscores"]["age"] / self.maxageseconds
        unweighted["age"] = (x + 1)**(-5 * x)
        # tags
        if article["metricscores"]["tags"] > 1:
            unweighted["tags"] = self.politicweight + (
                (article["metricscores"]["tags"] - 1) *
                (1 - self.politicweight))
        else:
            unweighted["tags"] = article["metricscores"]["tags"] * (
                1 - self.politicweight)
        # polarity
        unweighted["polarity"] = abs(article["metricscores"]["polarity"])
        # subjectivity (the original read the polarity score here by mistake)
        unweighted["subjectivity"] = article["metricscores"]["subjectivity"]
        # credibility
        unweighted["credibility"] = article["metricscores"]["credibility"] / 10
        # bias
        unweighted["bias"] = article["metricscores"]["bias"] / 5
        return unweighted

    def mediaBias(self, article):
        article["metrics"].update(self.ArticleScrap.parseURL(article["url"]))
        buildUrlFacts = ("https://mediabiasfactcheck.com/" +
                         article["media_name"].lower().replace(" ", "-"))
        article["metrics"].update(self.BiasScrap.parseURL(buildUrlFacts))

    def tagQuantification(self, article):
        metricscores = {}
        for x in article["metrics"]:
            if x == "tags":
                metricscores[x] = 0
                # A politics tag contributes a full point; the remaining
                # tags contribute proportionally, capped at maxtags.
                if "politics and government" in article["metrics"]["tags"]:
                    metricscores[x] += 1
                if len(article["metrics"]["tags"]) <= self.maxtags:
                    metricscores[x] += (len(article["metrics"]["tags"]) /
                                        self.maxtags)
                else:
                    metricscores[x] += 1
            else:
                metricscores[x] = article["metrics"][x]
        return metricscores

    def retrieveData(self):
        data = self.Retrie.retrieve(self.timestamp,
                                    storylimit=self.datasetSize)
        data = data['articles'][:self.datasetSize]
        unrankeddata = {}
        for index, article in enumerate(data):
            print(index + 1)
            self.mediaBias(article)
            article["metricscores"] = self.tagQuantification(article)
            # Combine the normalised metrics into one weighted score.
            unweighted = self.percentage(article)
            score = 0
            for metric in unweighted:
                score += unweighted[metric] * self.weight[metric]
            article["score"] = abs(score)
            unrankeddata[abs(score)] = article
        # Keep the top rankedsetSize articles, ordered by descending score
        # (the original broke one iteration late and kept an extra article).
        rankeddata = {}
        for index, article in enumerate(
                sorted(unrankeddata.keys(), reverse=True)):
            rankeddata[index + 1] = unrankeddata[article]
            if index + 1 == self.rankedsetSize:
                break
        with open("json/data_{0}.json".format(self.timestamp), "w") as file:
            json.dump(rankeddata, file)
        return rankeddata
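# A self-contained sketch of the age normalisation used in percentage():
# x is the article's age as a fraction of maxageseconds (one week), and
# (x + 1) ** (-5 * x) decays smoothly from 1.0 for a fresh article toward
# about 0.03 at a week old. The helper name is illustrative only.
MAX_AGE_SECONDS = 604800  # one week, as in main.__init__


def age_score(age_seconds):
    x = age_seconds / MAX_AGE_SECONDS
    return (x + 1) ** (-5 * x)


for hours in (0, 24, 72, 168):
    print(hours, "h ->", round(age_score(hours * 3600), 3))
    # 0 h -> 1.0, 24 h -> 0.909, 72 h -> 0.581, 168 h -> 0.031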