import os

import gensim.downloader as api
from gensim import matutils
from nltk.tokenize import word_tokenize
import numpy as np

from quasimodo.assertion_output.closest_indexes import ClosestIndexes
from quasimodo.assertion_output.saliency_and_typicality_computation_submodule import get_raw_predicate
from quasimodo.parameters_reader import ParametersReader
from quasimodo.data_structures.submodule_interface import SubmoduleInterface
from quasimodo.assertion_output.tsv_output_submodule import get_version

SLICE_SIZE = 2000  # Change this in case of Memory problems
TOPK = 1000
N = 75

parameters_reader = ParametersReader()
# `os` must be imported for this fallback: it is evaluated whenever the
# "out-dir" parameter is missing or empty.
OUT_DIR = parameters_reader.get_parameter(
    "out-dir") or os.path.dirname(__file__) + "/out/"


def save_tsv_triples(triples):
    """Write the given rows to ``quasimodo<version>.tsv`` inside OUT_DIR.

    The file starts with a tab-separated header line, followed by the rows
    joined with newlines (no trailing newline is written after the last row).

    :param triples: iterable of already formatted rows (strings)
    """
    version = get_version()
    # os.path.join keeps the path valid whether or not OUT_DIR ends with a
    # separator; for a value ending in "/" the result is unchanged.
    save_file = os.path.join(OUT_DIR, "quasimodo" + str(version) + ".tsv")
    header = [
        "subject",
        "predicate",
        "object",
        "modality",
        "is_negative",
        "score",
        "sentences source",
        "neighborhood sigma",
        "local sigma",
    ]
    with open(save_file, "w") as f:
        f.write("\t".join(header) + "\n")
        f.write("\n".join(triples))
import os

from quasimodo.assertion_generation.question_file_submodule import QuestionFileSubmodule
from quasimodo.parameters_reader import ParametersReader

parameters_reader = ParametersReader()
FILENAME = parameters_reader.get_parameter("yahoo-questions") or \
    os.path.dirname(__file__) + "/data/questions-yahoo.txt"


class YahooQuestionsSubmodule(QuestionFileSubmodule):
    r"""QuestionFileSubmodule configured with the Yahoo! Answers questions file.

    Yahoo Questions are obtained by downloading them from
    https://webscope.sandbox.yahoo.com/. More precisely:

        * L5 - Yahoo! Answers Manner Questions, version 2.0
        * L6 - Yahoo! Answers Comprehensive Questions and Answers version 1.0

    Then, we run (``sed`` is used without ``-i`` because it filters the
    piped stream, not a file)::

        grep "<subject>" FullOct2007.xml | sed "s/<subject>//g" | \
            sed "s/<\/subject>//g" | \
            grep -i "^\(why\|how\) \(do\|does\|can\|cannot\|are\|is\)" > questions.txt
    """
    # The docstring is a raw string: the shell/regex backslashes above are
    # not valid Python escape sequences and would otherwise trigger warnings.

    def __init__(self, module_reference):
        """Initialise the parent class, then set the file name and the name."""
        super().__init__(module_reference)
        self._filename = FILENAME
        self._name = "Yahoo Questions"
from quasimodo.parameters_reader import ParametersReader from quasimodo.assertion_validation.association_submodule import AssociationSubmodule import os.path import pickle parameters_reader = ParametersReader() filename = parameters_reader.get_parameter("imagetag-associations") or "" CACHE_IMAGETAG = "cache_imagetag.pck" class ImagetagSubmodule(AssociationSubmodule): def __init__(self, module_reference): super().__init__(module_reference) self._module_reference = module_reference self._name = "Image Tag submodule" def _get_associations(self, subjects): associations = dict() if os.path.isfile(CACHE_IMAGETAG): return pickle.load(open(CACHE_IMAGETAG, "rb")) with open(filename) as f: for line in f: line = line.strip().lower().split("\t") for word0 in line: if word0 not in associations: associations[word0] = dict() assos_word0 = associations[word0] for word1 in line: if word0 == word1: continue
from quasimodo.default_workflow import DefaultWorkflow
from quasimodo.parameters_reader import ParametersReader
import logging
import socket

# The log file name is derived from the host name (and the pattern mode).
name = str(socket.gethostname())
parameters_reader = ParametersReader()
PATTERN_FIRST = (parameters_reader.get_parameter("pattern-first") or "true") == "true"
if PATTERN_FIRST:
    name += "by_pattern"

# Configure logging once for the whole process. A second basicConfig() call
# would be ignored because the root logger already has a handler after this.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
    datefmt='%m-%d %H:%M',
    filename='log_' + name + '.txt',
    filemode='a')


def _ask_stage():
    """Read lines from stdin until one parses as an integer and return it."""
    while True:
        try:
            return int(input())
        except ValueError:
            # Ask again instead of aborting the run with a traceback.
            print("Please enter the index of a stage as an integer")


if __name__ == '__main__':
    # Create a workflow
    workflow = DefaultWorkflow()
    print("Choose the stage at which you should begin the processing")
    workflow.print_index()
    stage = _ask_stage()
    workflow.run_from(save=True, idx_from=stage)
import logging import os from quasimodo.parts_of_facts import PartsOfFacts from quasimodo.data_structures.submodule_interface import SubmoduleInterface from quasimodo.assertion_fusion.trainer import Trainer from quasimodo.parameters_reader import ParametersReader save_weights = True parameters_reader = ParametersReader() annotations_file = parameters_reader.get_parameter("annotations-file") or "data/training_active_learning.tsv" save_file = parameters_reader.get_parameter("weights-file") or os.path.dirname(__file__) + "/../temp/weights.tsv" def _save_weights(parts_of_facts): annotations = get_annotated_data() header = parts_of_facts.get_header() header.append("label") save = ["\t".join(header)] for fact in parts_of_facts.get_all_facts(): row = parts_of_facts.get_fact_row(fact) row.append(annotations.get((fact.get_subject().get(), fact.get_predicate().get(), fact.get_object().get(), str(int(fact.is_negative()))), -1)) row = [str(x) for x in row] save.append("\t".join(row))
import os
import re
from collections import Counter
import numpy as np
from quasimodo.parameters_reader import ParametersReader
from quasimodo.data_structures.submodule_interface import SubmoduleInterface
import logging

parameters_reader = ParametersReader()
OUT_DIR = parameters_reader.get_parameter("out-dir") or os.path.dirname(__file__) + "/../out/"
ANIMALS_FILENAME = parameters_reader.get_parameter("animals50") or os.path.dirname(__file__) + "/../data/animals_50.txt"
OCCUPATIONS_FILENAME = parameters_reader.get_parameter("occupations50") or \
    os.path.dirname(__file__) + "/../data/occupations_50.txt"

# Matches exactly "quasimodo<digits>.tsv": the dot is escaped and the end is
# anchored so that names such as "quasimodo3xtsv" or "quasimodo3.tsv.bak"
# are not counted as output versions.
_OUTPUT_FILE_REGEX = re.compile(r"quasimodo(?P<version>\d+)\.tsv$")


def get_version():
    """Return the highest version number among the output files in OUT_DIR.

    Files are expected to be named ``quasimodo<version>.tsv``. The result is
    never lower than 1, which is also returned when no file matches.

    :raises OSError: if OUT_DIR cannot be listed
    """
    version = 1
    for filename in os.listdir(OUT_DIR):
        match = _OUTPUT_FILE_REGEX.match(filename)
        if match is not None:
            version = max(version, int(match.group("version")))
    return version


class StatisticsSubmodule(SubmoduleInterface):

    def __init__(self, module_reference):
        super().__init__()
        self._module_reference = module_reference
import os

from quasimodo.seeds.subject_file_submodule import SubjectFileSubmodule
from quasimodo.parameters_reader import ParametersReader

parameters_reader = ParametersReader()
FILENAME = parameters_reader.get_parameter("occupations-subjects") or \
    os.path.dirname(__file__) + "/data/occupations_50.txt"


class OccupationsSubmodule(SubjectFileSubmodule):
    """SubjectFileSubmodule configured with the occupations subjects file."""

    def __init__(self, module_reference):
        """Initialise the parent class, then set the name and the file name.

        :param module_reference: reference passed on to the parent class and
            stored on the instance
        """
        # Run the parent initialiser like the other SubjectFileSubmodule
        # subclasses do, so attributes it sets up are not left missing.
        super().__init__(module_reference)
        self._module_reference = module_reference
        self._name = "Occupation Seeds"
        self._filename = FILENAME
import os

from quasimodo.assertion_generation.question_file_submodule import QuestionFileSubmodule
from quasimodo.parameters_reader import ParametersReader

parameters_reader = ParametersReader()

# Use the configured file when there is one, else the file shipped in data/.
FILENAME = parameters_reader.get_parameter("answercom-questions")
if not FILENAME:
    FILENAME = os.path.dirname(__file__) + "/data/questions-answerscom-filtered.txt"


class AnswerscomQuestionsSubmodule(QuestionFileSubmodule):
    """QuestionFileSubmodule configured with the Answers.com questions file.

    The questions are obtained from the website answers.com
    """

    def __init__(self, module_reference):
        """Initialise the parent class, then set the name and the file name."""
        super().__init__(module_reference)
        self._name = "Answers.com Questions"
        self._filename = FILENAME
import logging import os from quasimodo.spacy_accessor import get_default_annotator from quasimodo.data_structures.submodule_interface import SubmoduleInterface from quasimodo.parameters_reader import ParametersReader parameters_reader = ParametersReader() path_to_properties = parameters_reader.get_parameter("properties-dir") or "" class AreTransformationSubmodule(SubmoduleInterface): def __init__(self, module_reference): super().__init__() self._module_reference = module_reference self._name = "Are transformation" def process(self, input_interface): logging.info("Start the are predicate transformation") gfs = input_interface.get_generated_facts() conversion = dict() for filename in os.listdir(path_to_properties): name = "has_" + filename.replace(".txt", "") with open(path_to_properties + filename) as f: for line in f: line = line.strip() conversion[line] = name new_gfs = [] for gf in gfs:
import requests from quasimodo.cache.cachable_querying_system import CachableQueryingSystem from quasimodo.cache.mongodb_cache import MongoDBCache from quasimodo.parameters_reader import ParametersReader headers = {'User-agent': 'Mozilla/5.0'} # baseurl = "http://clients1.google.com/complete/search?" baseurl = "http://google.com/complete/search?" RELOADTIME = 600 # Look for new sentences? look_new = True parameters_reader = ParametersReader() DEFAULT_MONGODB_LOCATION = parameters_reader.get_parameter("default-mongodb-location") or "mongodb://localhost:27017/" SERVER_URL = (parameters_reader.get_parameter("server-url") or "http://localhost:5000/").strip("/") GET_URL = SERVER_URL + "/get_query" POST_URL = SERVER_URL + "/add_new" HEADERS_JSON = {'content-type': 'application/json'} logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) print("DEFAULT MONGODB LOCATION", DEFAULT_MONGODB_LOCATION) class GoogleAutocompleteClient(CachableQueryingSystem): """SubmoduleGoogleAutocomplete A submodule for the google autocomplete triple generation
from quasimodo.parameters_reader import ParametersReader
from quasimodo.assertion_validation.sentence_comparator import SentenceComparator

parameters_reader = ParametersReader()

# Fall back to an empty path when the parameter is missing or empty.
WHAT_QUESTION_FILE = parameters_reader.get_parameter("what-questions-file")
if not WHAT_QUESTION_FILE:
    WHAT_QUESTION_FILE = ""


class WhatQuestionsComparatorSubmodule(SentenceComparator):
    """SentenceComparator constructed with the "what questions" file."""

    def __init__(self, module_reference):
        """Pass the reference and the file to the parent, then set the name."""
        super().__init__(module_reference, WHAT_QUESTION_FILE)
        self._name = "What questions file"
import os

from quasimodo.assertion_generation.question_file_submodule import QuestionFileSubmodule
from quasimodo.parameters_reader import ParametersReader

parameters_reader = ParametersReader()

# Use the configured file when there is one, else the file shipped in data/.
FILENAME = parameters_reader.get_parameter("reddit-questions")
if not FILENAME:
    FILENAME = os.path.dirname(__file__) + "/data/questions-reddit.txt"


class RedditQuestionsSubmodule(QuestionFileSubmodule):
    """QuestionFileSubmodule configured with the Reddit questions file.

    Reddit questions are obtained from a dump of Reddit
    """

    def __init__(self, module_reference):
        """Initialise the parent class, then set the name and the file name."""
        super().__init__(module_reference)
        self._name = "Reddit Questions"
        self._filename = FILENAME
import os
import spacy
import language_check
import logging
import time
from subprocess import call
from quasimodo.inflect_accessor import DEFAULT_INFLECT
from quasimodo.parameters_reader import ParametersReader

parameters_reader = ParametersReader()
CACHE_DIR = parameters_reader.get_parameter("question-cache-dir") or \
    os.path.dirname(__file__) + "/question2statement/"

_tool = language_check.LanguageTool('en-US')
_nlp = spacy.load('en_core_web_lg')

# Indexes named TEXT and POS
TEXT = 0
POS = 1

# One verb per entry. Every entry needs its own comma: two adjacent string
# literals are silently concatenated ("did" "should" became "didshould"),
# which removed both verbs from the list.
NEGATE_VERB = [
    "am",
    "is",
    "are",
    "was",
    "were",
    "do",
    "does",
    "did",
    "should",
    "must",
    "would",
    "may",
    "have",
    "has",
    "might",
    "shall",
    "will",
    "could",
]


def _correct_tokens(tokens, pos):
    merge_next = False
    res_tokens = []
    res_pos = []
import logging import os from quasimodo.data_structures.multiple_scores import MultipleScore from quasimodo.data_structures.multiple_source_occurrence import MultipleSourceOccurrence from quasimodo.data_structures.submodule_interface import SubmoduleInterface from quasimodo.data_structures.generated_fact import GeneratedFact from quasimodo.parameters_reader import ParametersReader parameters_reader = ParametersReader() FILENAME = parameters_reader.get_parameter("stats-snippets") or \ os.path.dirname(__file__) + "/data/stats_animal_occupations_snippets.tsv" class ArchitSubmodule(SubmoduleInterface): def __init__(self, module_reference): super().__init__() self._module_reference = module_reference self._name = "Archit submodule" # To redefine self._index = -1 # column of the feature def process(self, input_interface): logging.info("Start the " + self._name + " archit submodule") first = True spos = set() for gf in input_interface.get_generated_facts(): spos.add((gf.get_subject().get(), gf.get_predicate().get(), gf.get_object().get())) new_gfs = [] with open(FILENAME) as f: for line in f:
from urllib.parse import quote import http.client, json from quasimodo.cache.cachable_querying_system import CachableQueryingSystem from quasimodo.cache.mongodb_cache import MongoDBCache from quasimodo.parameters_reader import ParametersReader from quasimodo.assertion_generation.browser_autocomplete_submodule import BrowserAutocompleteSubmodule import logging import time parameters_reader = ParametersReader() DEFAULT_MONGODB_LOCATION = parameters_reader.get_parameter( "default-mongodb-location") or "mongodb://localhost:27017/" subscriptionKey = parameters_reader.get_parameter("bing-key") or "" OK = 200 # Location of the api host = 'api.cognitive.microsoft.com' path = '/bing/v7.0/suggestions' # language mkt = 'en-US' def get_response(query, lang): params = '?mkt=' + lang + '&q=' + quote(query) headers = {'Ocp-Apim-Subscription-Key': subscriptionKey} conn = http.client.HTTPSConnection(host) conn.request("GET", path + params, None, headers)
FORBIDDEN_BEFORE_SUBJECT = ["a", "the", "an"] STATEMENT = 0 SCORE = 1 PATTERN = 2 SUBJECT = 3 NEGATIVITY = 4 QUESTION = 5 _nlp = spacy.load('en_core_web_sm') reference_corenlp = SubmoduleReferenceInterface("CoreNLP") reference_openie5 = SubmoduleReferenceInterface("OpenIE5") reference_manual = SubmoduleReferenceInterface("Manual") parameters_reader = ParametersReader() MEMORY_CORENLP = parameters_reader.get_parameter("memory-corenlp") or "10G" CACHE_CORENLP_FILENAME = parameters_reader.get_parameter("cache-corenlp") or \ os.path.dirname(__file__) + "/data/cache_corenlp.tsv" def _simple_extraction(sentence): tokens = [] for token in _nlp(sentence): tokens.append(token.text.lower()) if "can" in tokens: idx_can = tokens.index("can") if tokens[0] == "not": return [ ' '.join(tokens[1:idx_can]), "can", " ".join(tokens[idx_can + 1:]), True
import logging

from quasimodo.parameters_reader import ParametersReader
from quasimodo.data_structures.submodule_interface import SubmoduleInterface
from quasimodo.data_structures.fact import Fact

parameters_reader = ParametersReader()
SEEDS_LOCATION = parameters_reader.get_parameter("conceptnet-seeds") or ""


class ConceptNetSeedsSubmodule(SubmoduleInterface):
    """Submodule adding the facts listed in the ConceptNet seeds file."""

    def __init__(self, module_reference):
        """Store the module reference and set the submodule name."""
        super().__init__()
        self._module_reference = module_reference
        self._name = "ConceptNet Seeds"

    def process(self, input_interface):
        """Read SEEDS_LOCATION and add one Fact per valid line as seeds.

        Each line is expected to hold at least three tab-separated columns;
        the first three are passed to ``Fact``. Blank lines are ignored and
        lines with fewer than three columns are skipped with a warning
        instead of aborting the whole run with an IndexError.

        :param input_interface: object providing ``add_seeds``
        :return: the result of ``input_interface.add_seeds(facts)``
        :raises OSError: if SEEDS_LOCATION cannot be opened
        """
        logging.info("Start ConceptNet Seeds gathering")
        facts = []
        with open(SEEDS_LOCATION) as f:
            for line_number, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    continue
                spo = line.split("\t")
                if len(spo) < 3:
                    logging.warning(
                        "Skipping malformed ConceptNet seed at line %d: %r",
                        line_number, line)
                    continue
                facts.append(Fact(spo[0], spo[1], spo[2]))
        logging.info("%d seeds from ConceptNet were loaded", len(facts))
        return input_interface.add_seeds(facts)
import os import os.path import logging import pickle import time from quasimodo.parameters_reader import ParametersReader parameters_reader = ParametersReader() filename = parameters_reader.get_parameter("openie-file") or None filename_no_found = parameters_reader.get_parameter("openie-file-no-found") or\ os.path.dirname(__file__) + "/data/no_found_openie_sentences.txt" CACHE_OPENIE_READER = "cache_openie_reader.pck" class OpenIEReader(object): def __init__(self): self.sentence_to_fact = dict() if filename is not None: self.initialize_from_filename() def initialize_from_filename(self): if os.path.isfile(CACHE_OPENIE_READER): self.sentence_to_fact = pickle.load(open(CACHE_OPENIE_READER, "rb")) return with open(filename) as f: temp = [] sentence = '' for line in f:
from quasimodo.parameters_reader import ParametersReader
from quasimodo.assertion_validation.sentence_comparator import SentenceComparator

parameters_reader = ParametersReader()

# Fall back to an empty path when the parameter is missing or empty.
CONCEPTUAL_CAPTION_FILE = parameters_reader.get_parameter("conceptual-caption-file")
if not CONCEPTUAL_CAPTION_FILE:
    CONCEPTUAL_CAPTION_FILE = ""


class ConceptualCaptionsComparatorSubmodule(SentenceComparator):
    """SentenceComparator constructed with the conceptual captions file."""

    def __init__(self, module_reference):
        """Pass the reference and the file to the parent, then set the name."""
        super().__init__(module_reference, CONCEPTUAL_CAPTION_FILE)
        self._name = "Conceptual Caption"
import os import logging import time import re from bs4 import BeautifulSoup import apiclient from quasimodo.cache.mongodb_cache import MongoDBCache from quasimodo.inflect_accessor import DEFAULT_INFLECT from quasimodo.parameters_reader import ParametersReader from quasimodo.data_structures.submodule_interface import SubmoduleInterface parameters_reader = ParametersReader() api_key = parameters_reader.get_parameter("google-book-key") or "" DEFAULT_MONGODB_LOCATION = parameters_reader.get_parameter( "default-mongodb-location") or "mongodb://localhost:27017/" try: service = apiclient.discovery.build('books', 'v1', developerKey=api_key) except Exception as e: logging.warning("When initializing Google Book: " + str(e)) service = None cache_dir = os.path.dirname(__file__) + "/googlebook-cache/" cache_file = cache_dir + "cache.tsv" calls_per_seconds = 1 class GoogleBookSubmodule(SubmoduleInterface):
from quasimodo.parameters_reader import ParametersReader from quasimodo.assertion_validation.association_submodule import AssociationSubmodule import flickr_api import logging import os parameters_reader = ParametersReader() filename = parameters_reader.get_parameter("flickr-clusters") or "" class FlickrClustersSubmodule(AssociationSubmodule): def __init__(self, module_reference): super().__init__(module_reference) self._module_reference = module_reference self._name = "Flickr" def _get_clusters(self, subject): clusters = [] try: clusters = flickr_api.Tag.getClusters(tag=subject) except flickr_api.flickrerrors.FlickrAPIError: logging.info(subject + " has no cluster") except TypeError: logging.info("Problem of type with " + subject) except Exception as e: logging.info("Unknown error " + str(e)) res = [] for cluster in clusters: temp = [] for tag in cluster.tags: temp.append(tag.text.lower())
import os

from quasimodo.seeds.subject_file_submodule import SubjectFileSubmodule
from quasimodo.parameters_reader import ParametersReader

parameters_reader = ParametersReader()

# Use the configured file when there is one, else the file shipped in data/.
FILENAME = parameters_reader.get_parameter("conceptnet-subjects")
if not FILENAME:
    FILENAME = os.path.dirname(__file__) + "/data/conceptnet_subjects.txt"


class ConceptnetSubjectsSubmodule(SubjectFileSubmodule):
    """SubjectFileSubmodule configured with the ConceptNet subjects file."""

    def __init__(self, module_reference):
        """Initialise the parent, then store the reference, file and name."""
        super().__init__(module_reference)
        self._filename = FILENAME
        self._name = "Conceptnet Subject Seeds"
        self._module_reference = module_reference
import os

from quasimodo.seeds.subject_file_submodule import SubjectFileSubmodule
from quasimodo.parameters_reader import ParametersReader

parameters_reader = ParametersReader()

# Use the configured file when there is one, else the file shipped in data/.
FILENAME = parameters_reader.get_parameter("animal-subjects")
if not FILENAME:
    FILENAME = os.path.dirname(__file__) + "/data/anitemp.txt"


class AnimalSubmodule(SubjectFileSubmodule):
    """SubjectFileSubmodule configured with the animal subjects file."""

    def __init__(self, module_reference):
        """Initialise the parent, then store the reference, file and name."""
        super().__init__(module_reference)
        self._filename = FILENAME
        self._name = "Animal Seeds"
        self._module_reference = module_reference
import json import logging import time from urllib.parse import quote import requests from quasimodo.cache.cachable_querying_system import CachableQueryingSystem from quasimodo.cache.mongodb_cache import MongoDBCache from quasimodo.parameters_reader import ParametersReader from quasimodo.assertion_generation.browser_autocomplete_submodule import BrowserAutocompleteSubmodule parameters_reader = ParametersReader() PATTERN_FIRST = (parameters_reader.get_parameter("pattern-first") or "true") == "true" headers = {'User-agent': 'Mozilla/5.0'} # baseurl = "http://clients1.google.com/complete/search?" baseurl = "http://google.com/complete/search?" RELOADTIME = 60 # Look for new sentences? look_new = not PATTERN_FIRST DEFAULT_MONGODB_LOCATION = parameters_reader.get_parameter( "default-mongodb-location") or "mongodb://localhost:27017/" class GoogleAutocompleteSubmodule(BrowserAutocompleteSubmodule, CachableQueryingSystem): """SubmoduleGoogleAutocomplete