示例#1
0
import gensim.downloader as api
from gensim import matutils
from nltk.tokenize import word_tokenize
import numpy as np

from quasimodo.assertion_output.closest_indexes import ClosestIndexes
from quasimodo.assertion_output.saliency_and_typicality_computation_submodule import get_raw_predicate
from quasimodo.parameters_reader import ParametersReader
from quasimodo.data_structures.submodule_interface import SubmoduleInterface
from quasimodo.assertion_output.tsv_output_submodule import get_version

# Tunable module-level constants.
SLICE_SIZE = 2000  # Change this in case of Memory problems
TOPK = 1000
N = 75

# Output directory: the "out-dir" parameter when it is set to a truthy value,
# otherwise the "out/" folder located next to this file.
# NOTE(review): `os` is not among the imports visible at the top of this
# excerpt -- confirm it is imported; otherwise the fallback expression raises
# NameError whenever "out-dir" is not configured.
parameters_reader = ParametersReader()
OUT_DIR = parameters_reader.get_parameter(
    "out-dir") or os.path.dirname(__file__) + "/out/"


def save_tsv_triples(triples):
    """Write a header row followed by the given rows to a versioned TSV file.

    The destination is ``OUT_DIR + "quasimodo" + <version> + ".tsv"`` where
    the version comes from ``get_version()``. An existing file is overwritten.

    Args:
        triples: iterable of strings, one per output row (each row is
            expected to be already tab-joined). Rows are joined with
            newlines; no newline is written after the last row.
    """
    columns = [
        "subject", "predicate", "object", "modality", "is_negative",
        "score", "sentences source", "neighborhood sigma", "local sigma"
    ]
    header_line = "\t".join(columns) + "\n"
    destination = OUT_DIR + "quasimodo" + str(get_version()) + ".tsv"
    with open(destination, "w") as out:
        out.write(header_line)
        out.write("\n".join(triples))

示例#2
0
import os

from quasimodo.assertion_generation.question_file_submodule import QuestionFileSubmodule
from quasimodo.parameters_reader import ParametersReader

parameters_reader = ParametersReader()
FILENAME = parameters_reader.get_parameter("yahoo-questions") or \
        os.path.dirname(__file__) + "/data/questions-yahoo.txt"


class YahooQuestionsSubmodule(QuestionFileSubmodule):
    r"""Question-file submodule configured for the Yahoo! Answers questions.

    Yahoo Questions are obtained by downloading them from
    https://webscope.sandbox.yahoo.com/. More precisely:
        * L5 - Yahoo! Answers Manner Questions, version 2.0
        * L6 - Yahoo! Answers Comprehensive Questions and Answers version 1.0

    Then, we run:
        grep "<subject>" FullOct2007.xml | sed "s/<subject>//g" | \
            sed "s/<\/subject>//g" | \
            grep -i "^\(why\|how\) \(do\|does\|can\|cannot\|are\|is\)" > questions.txt

    The file that is read is FILENAME: the "yahoo-questions" parameter when
    it is set, otherwise data/questions-yahoo.txt next to this module.

    This docstring is a raw string because the shell command above contains
    backslashes that are not valid Python escape sequences.
    """
    def __init__(self, module_reference):
        """Initialise the parent submodule, then set the file and the name.

        Args:
            module_reference: forwarded unchanged to QuestionFileSubmodule.
        """
        super().__init__(module_reference)
        self._filename = FILENAME
        self._name = "Yahoo Questions"
示例#3
0
from quasimodo.parameters_reader import ParametersReader
from quasimodo.assertion_validation.association_submodule import AssociationSubmodule
import os.path
import pickle

parameters_reader = ParametersReader()
filename = parameters_reader.get_parameter("imagetag-associations") or ""

CACHE_IMAGETAG = "cache_imagetag.pck"


class ImagetagSubmodule(AssociationSubmodule):
    def __init__(self, module_reference):
        """Set up the parent association submodule and this submodule's name.

        Args:
            module_reference: forwarded to the parent constructor and also
                kept on the instance as ``_module_reference``.
        """
        super().__init__(module_reference)
        self._name = "Image Tag submodule"
        self._module_reference = module_reference

    def _get_associations(self, subjects):
        associations = dict()
        if os.path.isfile(CACHE_IMAGETAG):
            return pickle.load(open(CACHE_IMAGETAG, "rb"))
        with open(filename) as f:
            for line in f:
                line = line.strip().lower().split("\t")
                for word0 in line:
                    if word0 not in associations:
                        associations[word0] = dict()
                    assos_word0 = associations[word0]
                    for word1 in line:
                        if word0 == word1:
                            continue
示例#4
0
from quasimodo.default_workflow import DefaultWorkflow
from quasimodo.parameters_reader import ParametersReader
import logging
import socket

# The log file name is built from the host name, with a "by_pattern" suffix
# when the "pattern-first" parameter is "true" (its default when unset).
name = str(socket.gethostname())

parameters_reader = ParametersReader()
PATTERN_FIRST = (parameters_reader.get_parameter("pattern-first")
                 or "true") == "true"
if PATTERN_FIRST:
    name += "by_pattern"

# Configure the root logger once, at import time: DEBUG level, appending to
# "log_<name>.txt".
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
    datefmt='%m-%d %H:%M',
    filename='log_' + name + '.txt',
    filemode='a')

if __name__ == '__main__':
    # Logging is already configured by the basicConfig() call above. A second
    # basicConfig() here would be silently ignored (the root logger already
    # has a handler), so it is not repeated.
    logger = logging.getLogger(__name__)
    # Create a workflow
    workflow = DefaultWorkflow()
    print("Choose the stage at which you should begin the processing")
    workflow.print_index()
    # The stage index is read from standard input; a non-integer answer
    # raises ValueError.
    stage = int(input())
    workflow.run_from(save=True, idx_from=stage)
import logging
import os

from quasimodo.parts_of_facts import PartsOfFacts
from quasimodo.data_structures.submodule_interface import SubmoduleInterface
from quasimodo.assertion_fusion.trainer import Trainer
from quasimodo.parameters_reader import ParametersReader


save_weights = True


parameters_reader = ParametersReader()
annotations_file = parameters_reader.get_parameter("annotations-file") or "data/training_active_learning.tsv"
save_file = parameters_reader.get_parameter("weights-file") or os.path.dirname(__file__) + "/../temp/weights.tsv"


def _save_weights(parts_of_facts):
    annotations = get_annotated_data()
    header = parts_of_facts.get_header()
    header.append("label")
    save = ["\t".join(header)]
    for fact in parts_of_facts.get_all_facts():
        row = parts_of_facts.get_fact_row(fact)
        row.append(annotations.get((fact.get_subject().get(),
                                    fact.get_predicate().get(),
                                    fact.get_object().get(),
                                    str(int(fact.is_negative()))),
                                   -1))
        row = [str(x) for x in row]
        save.append("\t".join(row))
示例#6
0
import os
import re
from collections import Counter
import numpy as np

from quasimodo.parameters_reader import ParametersReader
from quasimodo.data_structures.submodule_interface import SubmoduleInterface
import logging

parameters_reader = ParametersReader()
OUT_DIR = parameters_reader.get_parameter("out-dir") or os.path.dirname(__file__) + "/../out/"
ANIMALS_FILENAME = parameters_reader.get_parameter("animals50") or os.path.dirname(__file__) + "/../data/animals_50.txt"
OCCUPATIONS_FILENAME = parameters_reader.get_parameter("occupations50") or \
                       os.path.dirname(__file__) + "/../data/occupations_50.txt"


def get_version(out_dir=None):
    """Return the highest version number among the quasimodo TSV outputs.

    Only directory entries named exactly ``quasimodo<digits>.tsv`` are
    considered; every other entry is ignored.

    Args:
        out_dir: directory to scan. Defaults to the module-level OUT_DIR.

    Returns:
        The largest version number found, and never less than 1 (1 is also
        returned when the directory holds no matching file).

    Raises:
        OSError: propagated from ``os.listdir`` when the directory cannot
            be listed.
    """
    if out_dir is None:
        out_dir = OUT_DIR
    # The dot is escaped and the whole name must match, so that entries such
    # as "quasimodo7.tsv.bak" or "quasimodo7xtsv" are not taken for outputs.
    regex_output = re.compile(r"quasimodo(?P<version>\d+)\.tsv")
    version = 1
    for entry in os.listdir(out_dir):
        match = regex_output.fullmatch(entry)
        if match is not None:
            version = max(version, int(match.group("version")))
    return version


class StatisticsSubmodule(SubmoduleInterface):

    def __init__(self, module_reference):
        super().__init__()
        self._module_reference = module_reference
示例#7
0
import os
from quasimodo.seeds.subject_file_submodule import SubjectFileSubmodule
from quasimodo.parameters_reader import ParametersReader


parameters_reader = ParametersReader()
FILENAME = parameters_reader.get_parameter("occupations-subjects") or \
        os.path.dirname(__file__) + "/data/occupations_50.txt"


class OccupationsSubmodule(SubjectFileSubmodule):
    """Subject-file submodule configured with the occupations file.

    The file is FILENAME: the "occupations-subjects" parameter when it is
    set, otherwise data/occupations_50.txt next to this module.
    """

    def __init__(self, module_reference):
        """Initialise the parent submodule, then set reference, name and file.

        Args:
            module_reference: forwarded to SubjectFileSubmodule and kept on
                the instance as ``_module_reference``.
        """
        # The parent constructor is called first, as the other
        # SubjectFileSubmodule subclasses do, so that any state it sets up
        # exists before the attributes below override it.
        super().__init__(module_reference)
        self._module_reference = module_reference
        self._name = "Occupation Seeds"
        self._filename = FILENAME
import os

from quasimodo.assertion_generation.question_file_submodule import QuestionFileSubmodule
from quasimodo.parameters_reader import ParametersReader

parameters_reader = ParametersReader()
FILENAME = parameters_reader.get_parameter("answercom-questions") or \
        os.path.dirname(__file__) + "/data/questions-answerscom-filtered.txt"


class AnswerscomQuestionsSubmodule(QuestionFileSubmodule):
    """Question-file submodule for questions obtained from answers.com."""

    def __init__(self, module_reference):
        """Initialise the parent, then set this submodule's name and file.

        Args:
            module_reference: forwarded unchanged to QuestionFileSubmodule.
        """
        super().__init__(module_reference)
        self._name = "Answers.com Questions"
        # FILENAME holds the questions taken from the website answers.com
        self._filename = FILENAME
import logging
import os

from quasimodo.spacy_accessor import get_default_annotator
from quasimodo.data_structures.submodule_interface import SubmoduleInterface
from quasimodo.parameters_reader import ParametersReader


parameters_reader = ParametersReader()
path_to_properties = parameters_reader.get_parameter("properties-dir") or ""


class AreTransformationSubmodule(SubmoduleInterface):

    def __init__(self, module_reference):
        """Initialise the parent interface, then set the name and reference.

        Args:
            module_reference: kept on the instance as ``_module_reference``.
        """
        super().__init__()
        self._name = "Are transformation"
        self._module_reference = module_reference

    def process(self, input_interface):
        logging.info("Start the are predicate transformation")
        gfs = input_interface.get_generated_facts()
        conversion = dict()
        for filename in os.listdir(path_to_properties):
            name = "has_" + filename.replace(".txt", "")
            with open(path_to_properties + filename) as f:
                for line in f:
                    line = line.strip()
                    conversion[line] = name
        new_gfs = []
        for gf in gfs:
示例#10
0
import requests

from quasimodo.cache.cachable_querying_system import CachableQueryingSystem
from quasimodo.cache.mongodb_cache import MongoDBCache
from quasimodo.parameters_reader import ParametersReader


# HTTP headers and base URL for the autocomplete requests.
headers = {'User-agent': 'Mozilla/5.0'}
# baseurl = "http://clients1.google.com/complete/search?"
baseurl = "http://google.com/complete/search?"
RELOADTIME = 600  # NOTE(review): unit not visible here -- presumably seconds

# Look for new sentences?
look_new = True

# Each setting falls back to a local default when the parameter is not set.
parameters_reader = ParametersReader()
DEFAULT_MONGODB_LOCATION = parameters_reader.get_parameter("default-mongodb-location") or "mongodb://localhost:27017/"
# Leading/trailing slashes are stripped so the endpoint paths below can be
# appended with exactly one "/".
SERVER_URL = (parameters_reader.get_parameter("server-url") or "http://localhost:5000/").strip("/")
GET_URL = SERVER_URL + "/get_query"
POST_URL = SERVER_URL + "/add_new"
HEADERS_JSON = {'content-type': 'application/json'}


# Import-time side effects: logging is sent to stdout at DEBUG level and the
# MongoDB location is printed.
# NOTE(review): `logging` and `sys` are not among the imports visible at the
# top of this excerpt -- confirm they are imported, otherwise this line
# raises NameError on import.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

print("DEFAULT MONGODB LOCATION", DEFAULT_MONGODB_LOCATION)


class GoogleAutocompleteClient(CachableQueryingSystem):
    """SubmoduleGoogleAutocomplete
    A submodule for the google autocomplete triple generation
示例#11
0
from quasimodo.parameters_reader import ParametersReader
from quasimodo.assertion_validation.sentence_comparator import SentenceComparator

parameters_reader = ParametersReader()
WHAT_QUESTION_FILE = parameters_reader.get_parameter(
    "what-questions-file") or ""


class WhatQuestionsComparatorSubmodule(SentenceComparator):
    """SentenceComparator constructed with WHAT_QUESTION_FILE.

    WHAT_QUESTION_FILE is the "what-questions-file" parameter, or an empty
    string when that parameter is not set.
    """

    def __init__(self, module_reference):
        """Build the parent comparator, then set this submodule's name.

        Args:
            module_reference: forwarded unchanged to SentenceComparator.
        """
        super().__init__(module_reference, WHAT_QUESTION_FILE)
        # Assigned after the parent constructor so it is the final value.
        self._name = "What questions file"
示例#12
0
import os

from quasimodo.assertion_generation.question_file_submodule import QuestionFileSubmodule
from quasimodo.parameters_reader import ParametersReader


parameters_reader = ParametersReader()
FILENAME = parameters_reader.get_parameter("reddit-questions") or \
        os.path.dirname(__file__) + "/data/questions-reddit.txt"



class RedditQuestionsSubmodule(QuestionFileSubmodule):
    """Question-file submodule for questions taken from a dump of Reddit."""

    def __init__(self, module_reference):
        """Initialise the parent, then set this submodule's name and file.

        Args:
            module_reference: forwarded unchanged to QuestionFileSubmodule.
        """
        super().__init__(module_reference)
        self._name = "Reddit Questions"
        # FILENAME holds the questions obtained from a dump of Reddit
        self._filename = FILENAME
示例#13
0
import os
import spacy
import language_check
import logging
import time
from subprocess import call

from quasimodo.inflect_accessor import DEFAULT_INFLECT
from quasimodo.parameters_reader import ParametersReader

parameters_reader = ParametersReader()
CACHE_DIR = parameters_reader.get_parameter("question-cache-dir") or \
            os.path.dirname(__file__) + "/question2statement/"

_tool = language_check.LanguageTool('en-US')
_nlp = spacy.load('en_core_web_lg')

TEXT = 0
POS = 1

# Auxiliary and modal verb forms (presumably the verbs a negation can attach
# to -- confirm against the code that uses this list).
# Keep a comma after every entry: adjacent string literals are implicitly
# concatenated by Python ("did" "should" would become "didshould").
NEGATE_VERB = [
    "am", "is", "are", "was", "were", "do", "does", "did",
    "should", "must", "would", "may", "have", "has", "might", "shall", "will",
    "could",
]


def _correct_tokens(tokens, pos):
    merge_next = False
    res_tokens = []
    res_pos = []
示例#14
0
import logging
import os

from quasimodo.data_structures.multiple_scores import MultipleScore
from quasimodo.data_structures.multiple_source_occurrence import MultipleSourceOccurrence
from quasimodo.data_structures.submodule_interface import SubmoduleInterface
from quasimodo.data_structures.generated_fact import GeneratedFact
from quasimodo.parameters_reader import ParametersReader

parameters_reader = ParametersReader()
FILENAME = parameters_reader.get_parameter("stats-snippets") or \
        os.path.dirname(__file__) + "/data/stats_animal_occupations_snippets.tsv"


class ArchitSubmodule(SubmoduleInterface):
    def __init__(self, module_reference):
        """Initialise the parent interface and the default attributes.

        Args:
            module_reference: kept on the instance as ``_module_reference``.
        """
        super().__init__()
        self._index = -1  # column of the feature
        self._name = "Archit submodule"  # To redefine
        self._module_reference = module_reference

    def process(self, input_interface):
        logging.info("Start the " + self._name + " archit submodule")
        first = True
        spos = set()
        for gf in input_interface.get_generated_facts():
            spos.add((gf.get_subject().get(), gf.get_predicate().get(),
                      gf.get_object().get()))
        new_gfs = []
        with open(FILENAME) as f:
            for line in f:
from urllib.parse import quote
import http.client, json

from quasimodo.cache.cachable_querying_system import CachableQueryingSystem
from quasimodo.cache.mongodb_cache import MongoDBCache
from quasimodo.parameters_reader import ParametersReader
from quasimodo.assertion_generation.browser_autocomplete_submodule import BrowserAutocompleteSubmodule
import logging
import time

parameters_reader = ParametersReader()

DEFAULT_MONGODB_LOCATION = parameters_reader.get_parameter(
    "default-mongodb-location") or "mongodb://localhost:27017/"
subscriptionKey = parameters_reader.get_parameter("bing-key") or ""

OK = 200

# Location of the api
host = 'api.cognitive.microsoft.com'
path = '/bing/v7.0/suggestions'

# language
mkt = 'en-US'


def get_response(query, lang):
    params = '?mkt=' + lang + '&q=' + quote(query)
    headers = {'Ocp-Apim-Subscription-Key': subscriptionKey}
    conn = http.client.HTTPSConnection(host)
    conn.request("GET", path + params, None, headers)
FORBIDDEN_BEFORE_SUBJECT = ["a", "the", "an"]

STATEMENT = 0
SCORE = 1
PATTERN = 2
SUBJECT = 3
NEGATIVITY = 4
QUESTION = 5

_nlp = spacy.load('en_core_web_sm')

reference_corenlp = SubmoduleReferenceInterface("CoreNLP")
reference_openie5 = SubmoduleReferenceInterface("OpenIE5")
reference_manual = SubmoduleReferenceInterface("Manual")

parameters_reader = ParametersReader()
MEMORY_CORENLP = parameters_reader.get_parameter("memory-corenlp") or "10G"
CACHE_CORENLP_FILENAME = parameters_reader.get_parameter("cache-corenlp") or \
                         os.path.dirname(__file__) + "/data/cache_corenlp.tsv"


def _simple_extraction(sentence):
    tokens = []
    for token in _nlp(sentence):
        tokens.append(token.text.lower())
    if "can" in tokens:
        idx_can = tokens.index("can")
        if tokens[0] == "not":
            return [
                ' '.join(tokens[1:idx_can]), "can",
                " ".join(tokens[idx_can + 1:]), True
示例#17
0
import logging

from quasimodo.parameters_reader import ParametersReader
from quasimodo.data_structures.submodule_interface import SubmoduleInterface
from quasimodo.data_structures.fact import Fact

parameters_reader = ParametersReader()
SEEDS_LOCATION = parameters_reader.get_parameter("conceptnet-seeds") or ""


class ConceptNetSeedsSubmodule(SubmoduleInterface):
    """Submodule that adds seed facts read from a tab-separated file.

    The file is SEEDS_LOCATION: the "conceptnet-seeds" parameter, or an
    empty string when that parameter is not set.
    """

    def __init__(self, module_reference):
        """Initialise the parent interface, then set reference and name.

        Args:
            module_reference: kept on the instance as ``_module_reference``.
        """
        super().__init__()
        self._module_reference = module_reference
        self._name = "ConceptNet Seeds"

    def process(self, input_interface):
        """Read SEEDS_LOCATION and add one seed Fact per valid line.

        Each line is expected to hold subject, predicate and object
        separated by tabs; columns after the third are ignored. Blank lines
        are skipped, and lines with fewer than three columns are skipped
        with a warning, rather than aborting the run with an IndexError.

        Args:
            input_interface: object providing ``add_seeds``.

        Returns:
            Whatever ``input_interface.add_seeds`` returns for the facts.

        Raises:
            OSError: if SEEDS_LOCATION cannot be opened (which is the case
                for its empty-string default).
        """
        logging.info("Start ConceptNet Seeds gathering")
        facts = []
        with open(SEEDS_LOCATION) as f:
            for line_number, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    # e.g. a trailing empty line at the end of the file
                    continue
                spo = line.split("\t")
                if len(spo) < 3:
                    logging.warning(
                        "Skipping malformed ConceptNet seed line %d: %r",
                        line_number, line)
                    continue
                facts.append(Fact(spo[0], spo[1], spo[2]))
        logging.info("%d seeds from ConceptNet were loaded", len(facts))
        return input_interface.add_seeds(facts)
示例#18
0
import os
import os.path
import logging
import pickle
import time

from quasimodo.parameters_reader import ParametersReader

parameters_reader = ParametersReader()
filename = parameters_reader.get_parameter("openie-file") or None
filename_no_found = parameters_reader.get_parameter("openie-file-no-found") or\
        os.path.dirname(__file__) + "/data/no_found_openie_sentences.txt"

CACHE_OPENIE_READER = "cache_openie_reader.pck"


class OpenIEReader(object):
    def __init__(self):
        """Create an empty sentence-to-fact mapping and load it if possible.

        The mapping is filled through ``initialize_from_filename`` only when
        the module-level ``filename`` is not None.
        """
        self.sentence_to_fact = {}
        if filename is None:
            return
        self.initialize_from_filename()

    def initialize_from_filename(self):
        if os.path.isfile(CACHE_OPENIE_READER):
            self.sentence_to_fact = pickle.load(open(CACHE_OPENIE_READER,
                                                     "rb"))
            return
        with open(filename) as f:
            temp = []
            sentence = ''
            for line in f:
from quasimodo.parameters_reader import ParametersReader
from quasimodo.assertion_validation.sentence_comparator import SentenceComparator

parameters_reader = ParametersReader()
CONCEPTUAL_CAPTION_FILE = parameters_reader.get_parameter(
    "conceptual-caption-file") or ""


class ConceptualCaptionsComparatorSubmodule(SentenceComparator):
    """SentenceComparator constructed with CONCEPTUAL_CAPTION_FILE.

    CONCEPTUAL_CAPTION_FILE is the "conceptual-caption-file" parameter, or
    an empty string when that parameter is not set.
    """

    def __init__(self, module_reference):
        """Build the parent comparator, then set this submodule's name.

        Args:
            module_reference: forwarded unchanged to SentenceComparator.
        """
        super().__init__(module_reference, CONCEPTUAL_CAPTION_FILE)
        # Assigned after the parent constructor so it is the final value.
        self._name = "Conceptual Caption"
示例#20
0
import os
import logging
import time
import re
from bs4 import BeautifulSoup

import apiclient

from quasimodo.cache.mongodb_cache import MongoDBCache
from quasimodo.inflect_accessor import DEFAULT_INFLECT
from quasimodo.parameters_reader import ParametersReader
from quasimodo.data_structures.submodule_interface import SubmoduleInterface

# Settings read once at import time; each falls back to a default when the
# parameter is not set.
parameters_reader = ParametersReader()
api_key = parameters_reader.get_parameter("google-book-key") or ""
DEFAULT_MONGODB_LOCATION = parameters_reader.get_parameter(
    "default-mongodb-location") or "mongodb://localhost:27017/"

# Build the Google Books API client. Any failure is logged as a warning and
# leaves `service` set to None instead of aborting the import, so code using
# `service` has to cope with None.
try:
    service = apiclient.discovery.build('books', 'v1', developerKey=api_key)
except Exception as e:
    logging.warning("When initializing Google Book: " + str(e))
    service = None

# Cache location, relative to the directory of this file.
cache_dir = os.path.dirname(__file__) + "/googlebook-cache/"
cache_file = cache_dir + "cache.tsv"

# NOTE(review): presumably a rate limit for API calls -- usage not visible
# in this excerpt.
calls_per_seconds = 1


class GoogleBookSubmodule(SubmoduleInterface):
示例#21
0
from quasimodo.parameters_reader import ParametersReader
from quasimodo.assertion_validation.association_submodule import AssociationSubmodule
import flickr_api
import logging
import os

parameters_reader = ParametersReader()
filename = parameters_reader.get_parameter("flickr-clusters") or ""


class FlickrClustersSubmodule(AssociationSubmodule):
    def __init__(self, module_reference):
        """Set up the parent association submodule and this submodule's name.

        Args:
            module_reference: forwarded to the parent constructor and also
                kept on the instance as ``_module_reference``.
        """
        super().__init__(module_reference)
        self._name = "Flickr"
        self._module_reference = module_reference

    def _get_clusters(self, subject):
        clusters = []
        try:
            clusters = flickr_api.Tag.getClusters(tag=subject)
        except flickr_api.flickrerrors.FlickrAPIError:
            logging.info(subject + " has no cluster")
        except TypeError:
            logging.info("Problem of type with " + subject)
        except Exception as e:
            logging.info("Unknown error " + str(e))
        res = []
        for cluster in clusters:
            temp = []
            for tag in cluster.tags:
                temp.append(tag.text.lower())
示例#22
0
import os

from quasimodo.seeds.subject_file_submodule import SubjectFileSubmodule
from quasimodo.parameters_reader import ParametersReader

parameters_reader = ParametersReader()
FILENAME = parameters_reader.get_parameter("conceptnet-subjects") or \
        os.path.dirname(__file__) + "/data/conceptnet_subjects.txt"


class ConceptnetSubjectsSubmodule(SubjectFileSubmodule):
    """Subject-file submodule configured with the ConceptNet subjects file.

    The file is FILENAME: the "conceptnet-subjects" parameter when it is
    set, otherwise data/conceptnet_subjects.txt next to this module.
    """

    def __init__(self, module_reference):
        """Initialise the parent, then set the file, name and reference.

        Args:
            module_reference: forwarded to SubjectFileSubmodule and kept on
                the instance as ``_module_reference``.
        """
        super().__init__(module_reference)
        self._filename = FILENAME
        self._name = "Conceptnet Subject Seeds"
        self._module_reference = module_reference
示例#23
0
import os
from quasimodo.seeds.subject_file_submodule import SubjectFileSubmodule
from quasimodo.parameters_reader import ParametersReader

parameters_reader = ParametersReader()
FILENAME = parameters_reader.get_parameter("animal-subjects") or \
        os.path.dirname(__file__) + "/data/anitemp.txt"


class AnimalSubmodule(SubjectFileSubmodule):
    """Subject-file submodule configured with the animal subjects file.

    The file is FILENAME: the "animal-subjects" parameter when it is set,
    otherwise data/anitemp.txt next to this module.
    """

    def __init__(self, module_reference):
        """Initialise the parent, then set the file, name and reference.

        Args:
            module_reference: forwarded to SubjectFileSubmodule and kept on
                the instance as ``_module_reference``.
        """
        super().__init__(module_reference)
        self._filename = FILENAME
        self._name = "Animal Seeds"
        self._module_reference = module_reference
import json
import logging
import time
from urllib.parse import quote

import requests

from quasimodo.cache.cachable_querying_system import CachableQueryingSystem
from quasimodo.cache.mongodb_cache import MongoDBCache
from quasimodo.parameters_reader import ParametersReader
from quasimodo.assertion_generation.browser_autocomplete_submodule import BrowserAutocompleteSubmodule

parameters_reader = ParametersReader()
PATTERN_FIRST = (parameters_reader.get_parameter("pattern-first")
                 or "true") == "true"

headers = {'User-agent': 'Mozilla/5.0'}
# baseurl = "http://clients1.google.com/complete/search?"
baseurl = "http://google.com/complete/search?"
RELOADTIME = 60

# Look for new sentences?
look_new = not PATTERN_FIRST

DEFAULT_MONGODB_LOCATION = parameters_reader.get_parameter(
    "default-mongodb-location") or "mongodb://localhost:27017/"


class GoogleAutocompleteSubmodule(BrowserAutocompleteSubmodule,
                                  CachableQueryingSystem):
    """SubmoduleGoogleAutocomplete