Example #1
    def form_pun(self, eval_path):
        # Retrieve a sentence that contains the alternative word of a pun pair.
        retrieve = Retrieve(sentence_path=TEXT_DATA_DIR + TEXT_DATA,
                            pun_path=PUN_DATA_DIR + PUN_DATA)
        (pun, sentence, score) = retrieve.retrieve()

        if not sentence:
            print("No sentence with word {} was found. Exiting...".format(
                pun[1]))
            raise Exception()

        # POS-tag the retrieved sentence so a topic word can be located later.
        text = word_tokenize(sentence)
        tokenized = nltk.pos_tag(text)

        print(tokenized)
        print(sentence, pun[0], pun[1])
        # Convert the sentence and both pun words to token-id sequences.
        pre = self.tokenizer.texts_to_sequences([sentence])
        wp = self.tokenizer.texts_to_sequences([pun[0]])
        wa = self.tokenizer.texts_to_sequences([pun[1]])

        if (not wa[0]) or (not wp[0]):
            print(
                "The pun/alternative word pair does not exist in the parsed corpus. Exiting..."
            )
            raise Exception()

        # Swap the first occurrence of the alternative word for the pun word.
        for index_wa, seq in enumerate(pre[0]):
            if seq == wa[0][0]:
                pre[0][index_wa] = wp[0][0]
                break

        # Load the pretrained word2vec model used for topic-word similarity.
        wordsimilarity = WordSimilarity()
        wordsimilarity.word2vec()
        wordsimilarity.load()

        # Find a topic word in the sentence and query the similarity model;
        # retry with the next candidate if a word is out of vocabulary.
        try_limit = 5
        try_count = 0
        index_topic = 0
        while True:
            try:
                topic_word = None
                for i in range(index_topic, len(tokenized)):
                    (word, pos) = tokenized[i]
                    if pos == 'NNP':
                        # Proper nouns are replaced by a generic placeholder.
                        topic_word = "man"
                        print(word, pos)
                        index_topic = index_topic + 1
                        break

                    if pos in ('NN', 'PRP', 'NNS', 'PRP$'):
                        topic_word = word
                        print(word, pos)
                        index_topic = index_topic + 1
                        break
                    index_topic = index_topic + 1

                # Query the similarity model for candidate substitution words.
                result = wordsimilarity.getSimilar([topic_word, pun[0]],
                                                   [pun[1]], 10)
                other_result = wordsimilarity.getSimilar([pun[0]], [], 10)

                break
            except KeyError:
                print("Word {} is not in vocabulary, try with the next one".
                      format(topic_word))
                try_count = try_count + 1
                if try_limit == try_count:
                    print("Limit of tries has been reached. Exit...")
                    raise Exception()

        # Load the model that scores the surprisal of candidate substitutions.
        eval_surprisal = Evaluate()
        eval_surprisal.load_model(eval_path)

        # For each word similar to both topic and pun word, compute surprisal
        # and generate a smoothed candidate sentence.
        finals = []
        mean_amalgam = 0
        for (word, prob) in result:
            swap = self.tokenizer.texts_to_sequences([word])

            context_window = 2
            surprise = eval_surprisal.compute_surpisal(
                sentence=pre[0],
                pun_word=wa[0][0],
                pun_alternative=wp[0][0],
                context_window=context_window)
            mean_amalgam = mean_amalgam + surprise
            print(surprise)

            pre[0][index_topic] = swap[0][0]

            post_simple = self.tokenizer.sequences_to_texts([pre[0]])
            print(post_simple)

            # Mask the neighbouring positions with 0 before smoothing.
            pre[0][index_topic + 1] = 0
            if index_topic >= 2:
                pre[0][index_topic - 1] = 0

            post_smoothing = self.dac.inference(pre[0])
            post_smoothing = self.tokenizer.sequences_to_texts(
                post_smoothing.tolist())
            finals.append(post_smoothing)
            print(post_smoothing)
        print(finals)
        print(mean_amalgam / 10)

        # Repeat the procedure with words similar to the pun word alone.
        other_finals = []
        mean_similar = 0
        for (word, prob) in other_result:
            swap = self.tokenizer.texts_to_sequences([word])

            context_window = 2
            surprise = eval_surprisal.compute_surpisal(
                sentence=pre[0],
                pun_word=wa[0][0],
                pun_alternative=wp[0][0],
                context_window=context_window)
            mean_similar = mean_similar + surprise
            print(surprise)

            pre[0][index_topic] = swap[0][0]

            post_simple = self.tokenizer.sequences_to_texts([pre[0]])
            print(post_simple)

            pre[0][index_topic + 1] = 0
            if index_topic >= 2:
                pre[0][index_topic - 1] = 0

            post_smoothing = self.dac.inference(pre[0])
            post_smoothing = self.tokenizer.sequences_to_texts(
                post_smoothing.tolist())
            other_finals.append(post_smoothing)
            print(post_smoothing)
        print(other_finals)
        print(mean_similar / 10)

        # list.extend returns None, so extend first and return the combined list.
        finals.extend(other_finals)
        return finals
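
A minimal driver sketch for the method above. The enclosing class is not shown in this listing, so the class name PunGenerator and the model path are hypothetical; only form_pun(eval_path) comes from the example.

# Hypothetical usage: PunGenerator stands in for whichever class owns form_pun
# and is assumed to set up self.tokenizer and self.dac in its constructor.
generator = PunGenerator()
candidates = generator.form_pun(eval_path="models/eval.h5")  # path is an assumption
print(candidates)
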
Example #2
class App:
    # Initializer / instance attributes
    def __init__(self, taxonomy, protein):
        self.Taxonomy = taxonomy
        self.Protein = protein
        self.ncbi_api = Retrieve()
        self.tools = Tools()
        self.dataset = None
        self.fasta = None
        self.summary = None

    @classmethod
    def from_class(cls):
        return cls(User_input.from_input("taxonomy"),
                   User_input.from_input("protein"))

    @property
    def taxon_query(self):
        return self.Taxonomy.val

    @taxon_query.setter
    def taxon_query(self, inp):
        if inp != "taxonomy":
            self.Taxonomy = User_input.from_param(inp, "taxonomy")
        else:
            self.Taxonomy = User_input.from_input("taxonomy")

    @property
    def protein_query(self):
        return self.Protein.val

    @protein_query.setter
    def protein_query(self, inp):
        # Mirror the taxon_query setter: use an explicit value when one is given.
        if inp != "protein":
            self.Protein = User_input.from_param(inp, "protein")
        else:
            self.Protein = User_input.from_input("protein")

    def total_species(self):
        return len(self.dataset.keys())

    def total_seqs(self):
        return sum(1 for species in self.dataset
                   for acc in self.dataset[species])

    def get_taxa(self):
        # Given self.taxon_query, return the list of matching taxa from the NCBI Taxonomy database.
        return self.ncbi_api.get_taxa(self.taxon_query, "Taxonomy")

    def plot(self):
        self.tools.plot()

    def write(self, fasta, alt=""):
        if fasta:
            self.tools.write(fasta, self.protein_query, self.taxon_query)
        else:
            print("Missing fasta file! Please run get_fasta first.")

    def taxa(self, typ="all"):
        # Build the taxa-to-protein dataset from the fetched search results.
        self.dataset = self.ncbi_api.taxa_protein_dict(self.get_fasta(), typ)

    def get_summary(self):
        self.summary = self.ncbi_api.summary(self.protein_query,
                                             self.taxon_query)

    # TODO: refactor to be a property
    def get_fasta(self):
        # Initiate the NCBI search using esearch and efetch.
        self.fasta = self.ncbi_api.retrieve(self.protein_query,
                                            self.taxon_query)
        return self.fasta
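
A minimal usage sketch for the App class, assuming User_input, Retrieve, and Tools are importable from the surrounding project (those modules are not shown here):

# Build the app from interactive prompts, run the NCBI search, and save the results.
app = App.from_class()            # asks for a taxonomy and a protein name
fasta = app.get_fasta()           # esearch/efetch retrieval
app.taxa(typ="all")               # builds the taxa-to-protein dataset
print(app.total_species(), app.total_seqs())
app.write(fasta)                  # writes the FASTA via Tools.write
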
Example #3
class main():
    def __init__(self):
        self.BiasScrap = PolBiasScraper()
        self.ArticleScrap = ArticleScrapper()
        self.Retrie = Retrieve()
        self.dateTimeObj = datetime.now()
        self.timestamp = self.dateTimeObj.strftime("%Y-%m-%d-%H-%M")
        self.datasetSize = 20  # articles fetched per run
        self.rankedsetSize = 10  # articles kept in the final ranking
        self.maxtags = 30  # tag count at which the tag score saturates
        self.politicweight = 0.95  # share of the tag score given to political tags
        self.maxageseconds = 604800  # one week, in seconds

        self.weight = {
            "age": 0.12,
            "tags": 0.24,
            "polarity": 0.21,
            "subjectivity": 0.13,
            "credibility": 0.13,
            "bias": 0.17
        }

    def percentage(self, article):
        # Normalise each raw metric to a 0-1 score before weighting.
        unweighted = {
            "age": 0,
            "tags": 0,
            "polarity": 0,
            "subjectivity": 0,
            "credibility": 0,
            "bias": 0
        }

        # age: decays from 1 towards 0 as the article approaches maxageseconds
        x = (article["metricscores"]["age"] / self.maxageseconds)
        unweighted["age"] = (x + 1)**(-5 * x)

        # tags
        if article["metricscores"]["tags"] > 1:
            unweighted["tags"] = self.politicweight + (
                (article["metricscores"]["tags"] - 1) *
                (1 - self.politicweight))
        else:
            unweighted["tags"] = (article["metricscores"]["tags"]) * (
                1 - self.politicweight)

        # polarity
        x = abs(article["metricscores"]["polarity"])
        unweighted["polarity"] = x

        # subjectivity
        unweighted["subjectivity"] = article["metricscores"]["subjectivity"]

        # credibility: scale the credibility score into the 0-1 range
        unweighted["credibility"] = article["metricscores"]["credibility"] / 10

        # bias: scale the bias score into the 0-1 range
        unweighted["bias"] = article["metricscores"]["bias"] / 5

        return unweighted

    def mediaBias(self, article):
        # Enrich the article with scraped text metrics and a mediabiasfactcheck.com rating.
        article["metrics"].update(self.ArticleScrap.parseURL(article["url"]))
        buildUrlFacts = "https://mediabiasfactcheck.com/" + article[
            "media_name"].lower().replace(" ", "-")
        article["metrics"].update(self.BiasScrap.parseURL(buildUrlFacts))

    def tagQuantification(self, article):
        # Convert raw metrics into numeric scores; tags get a bonus for political
        # content plus a count component that saturates at self.maxtags.
        metricscores = {}
        for x in article["metrics"]:
            if x == "tags":
                metricscores[x] = 0
                if "politics and government" in article["metrics"]["tags"]:
                    metricscores[x] += 1
                if len(article["metrics"]["tags"]) <= self.maxtags:
                    metricscores[x] += len(article["metrics"]["tags"]) / self.maxtags
                else:
                    metricscores[x] += 1
            else:
                metricscores[x] = article["metrics"][x]
        return metricscores

    def retrieveData(self):
        # Fetch the latest articles, score each one, and keep the top-ranked entries.
        data = self.Retrie.retrieve(self.timestamp,
                                    storylimit=self.datasetSize)
        data = data['articles'][:self.datasetSize]

        unrankeddata = {}

        for index, article in enumerate(data):
            print(index + 1)

            self.mediaBias(article)

            article["metricscores"] = self.tagQuantification(article)

            # weights
            unweighted = self.percentage(article)

            # Weighted sum of the normalised metric scores.
            score = 0
            for metric in unweighted:
                score += unweighted[metric] * self.weight[metric]

            article["score"] = abs(score)
            unrankeddata[abs(score)] = article

        # Rank by score, highest first, keeping at most rankedsetSize articles.
        rankeddata = {}

        for index, key in enumerate(
                sorted(unrankeddata.keys(), reverse=True)[:self.rankedsetSize]):
            rankeddata[index + 1] = unrankeddata[key]
        with open("json/data_{0}.json".format(self.timestamp), "w") as file:
            json.dump(rankeddata, file)

        return rankeddata
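
A minimal usage sketch for the ranking pipeline above; PolBiasScraper, ArticleScrapper, Retrieve, and a json/ output directory are assumed to be available from the surrounding project:

# Score and rank the latest articles, then inspect the saved ranking.
ranker = main()
top_articles = ranker.retrieveData()  # also writes json/data_<timestamp>.json
for rank, article in top_articles.items():
    print(rank, article["score"], article["url"])
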