Example No. 1
def ImportDB(workspace):

    importScript = "/Users/amitrou/Documents/CodeProjects/historical_data/restore-db.sh"
    if not os.path.exists(importScript):
        raise Exception("Script doesn't exist")
        return

    settingsPath = os.path.join(workspace, "import_settings.json")
    if not os.path.exists(settingsPath):
        raise Exception("Configuration file doen't exist")
        return

    import_settings = Utilities.ReadJSON(settingsPath)

    if import_settings is None:
        raise Exception("Failed to read import settings")

    server = import_settings['host']
    port = import_settings['port']
    user = import_settings['user']
    dump_list = import_settings["dump_list"]

    if len(dump_list) == 0:
        raise Exception("No contents")

    for item in dump_list:
        dump_file = item["dump_file"]
        db_name = item["db_name"]

        if (dump_file is not None) and (db_name is not None):
            Utilities.ExecuteShellScript(importScript, server, port, user,
                                         db_name, dump_file)
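
The Utilities.ExecuteShellScript helper is not shown in this listing. A minimal sketch of such a helper, assuming it simply forwards its arguments to the restore script via subprocess (the signature and argument order are inferred from the call above, not taken from the actual implementation):

    import subprocess

    def ExecuteShellScript(script_path, host, port, user, db_name, dump_file):
        # Pass the connection details and dump file straight to restore-db.sh.
        subprocess.run(["bash", script_path, host, str(port), user, db_name, dump_file],
                       check=True)
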
Example No. 2
    def __heartbeat_cb(self, timer):
        self.__heartbeat_counter += 1

        if self.__heartbeat_counter >= Settings.MQTT_KEEPALIVE:
            try:
                self.__client.publish(
                    '{}/ping'.format(Settings.MQTT_USERNAME).encode(), b'ping')
                self.__heartbeat_counter = 0
            except OSError as ose:
                err_msg = str(ose)

                print("err time:", time())
                print(err_msg)

                if err_msg in ("[Errno 104] ECONNRESET", "-1"):
                    try:
                        self.__client.disconnect()
                    except OSError:
                        pass
                    finally:
                        self.__client.connect()
                elif err_msg == "[Errno 113] EHOSTUNREACH":
                    Utilities.hard_reset()

        gc.collect()
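
The callback above is presumably driven by a periodic timer. A minimal sketch of how it might be wired up on MicroPython (the timer id and the one-second period are assumptions):

    from machine import Timer

    # inside the class's __init__ (sketch):
    self.__heartbeat_timer = Timer(-1)  # -1 selects a virtual timer; hardware ids vary by port
    self.__heartbeat_timer.init(period=1000, mode=Timer.PERIODIC,
                                callback=self.__heartbeat_cb)
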
Example No. 3
    def __init__(self):
        self.data_file = app_config['data_file']
        self.texts_in_file = 'texts_in_file.txt'
        self.ner_texts_file = 'output.txt'
        self.utilities = Utilities()
        self.lemmatizer = WordNetLemmatizer()
        self.preprocessor = Preprocessor(
            ['remove_urls', 'remove_mentions', 'remove_hashtags', 'normalize'])
Example No. 4
    def __init__(self):
        self.path_to_jar = 'lib/stanford_parser/stanford-parser.jar'
        self.path_to_models_jar = 'lib/stanford_parser/stanford-english-corenlp-2018-02-27-models.jar'
        self.path_to_ner_tagger = 'lib/stanford_ner/stanford-ner.jar'
        self.path_to_ner_model = 'lib/stanford_ner/english.all.3class.distsim.crf.ser.gz'

        self.ner_tagger = StanfordNERTagger(self.path_to_ner_model,
                                            self.path_to_ner_tagger)
        self.dependency_parser = StanfordDependencyParser(
            path_to_jar=self.path_to_jar,
            path_to_models_jar=self.path_to_models_jar)
        self.lemmatizer = WordNetLemmatizer()
        self.utilities = Utilities()
Example No. 5
    def getEHRClasses(patientXML, children=True, parents=True, duplicates=False):

        if ( duplicates ):

            ehrClasses = Utilities.getXMLElements(patientXML, {}, children, parents, duplicates);
            allValues = [];
            for depth in ehrClasses: allValues += ehrClasses[depth];
            return allValues;

        else:

            # Combines all values in dictionary of EHR depths.
            return [element.tag for element in set(set().union(*list(Utilities.getXMLElements(patientXML, {}, children, parents, duplicates).values())))];
Example No. 6
    def centroidError():
        centroid1 = Body25.getCentroid(keypoint1)
        centroid2 = Body25.getCentroid(keypoint2)
        dist = Ut.distance(centroid1 / frame, centroid2 / frame, ignore_zero_vector=True)
        if np.isnan(dist):
            return np.inf
        return dist
Example No. 7
    def getEHRClassChildren(patientXML, ehrClass, children=True, parents=False, allEHRChildren=False, contextualiseChildren=True, removeGerunds=True):

        ehrClassChildren = {};

        for ehrClassExample in patientXML.findall(".//" + ehrClass):

            ehrClassExampleDepthsToChildren = Utilities.getXMLElements(ehrClassExample, {}, children, parents, False, True, True);

            if 0 in list(ehrClassExampleDepthsToChildren.keys()):

                for element in ehrClassExampleDepthsToChildren[0]:

                    # Contextualise those EHR children that do not give enough context on their own, because they are just generic children.
                    if ( contextualiseChildren and element.tag.lower() in TranslationConstants.FIELDS_THAT_INDICATE_RESOURCE_CAN_HOLD_ANY_DATA ):

                        # Work out how to present this new compound child (child + parent name), based on which separators are used by this EHR.
                        if ( TranslationConstants.SEPARATOR != "" ):
                            element.tag = ehrClass + TranslationConstants.SEPARATOR + element.tag;

                        else:
                            element.tag = ehrClass[0].upper() + ehrClass[1:] + element.tag;

                    ehrClassChildren.setdefault(ehrClass, []).extend([element.tag]);

                    # If an EHR word begins with a gerund (such as 'Managing' in 'ManagingOrganisation'), this potentially complicates the context of the word, and so should be accounted for. Remove gerunds AND add the gerund free version as an additional EHR child.
                    if ( removeGerunds ): ehrClassChildren.setdefault(ehrClass, []).extend([TranslationUtilities.removeGerund(element.tag)]);


            # As we may have multiple examples of an EHR class in an example piece of marked up data from an EHR vendor, we want to find all possible examples of children that can be listed under that class.
            if ( not allEHRChildren ): break;

        return ehrClassChildren;
Example No. 8
    def getFHIRClassesToChildren(fhirClasses=TranslationUtilities.getFHIRClasses(), linkedClasses=True, fhirClassesRecurse=False, selectiveRecurse=TranslationConstants.SELECTIVE_RECURSE, includesBackboneElements=True, mergeMainChildrenWithBackboneChildren=True):

        fhirClassesToChildren = {};

        if (includesBackboneElements and mergeMainChildrenWithBackboneChildren):

            for fhirClassAndBackboneElements in fhirClasses:

                fhirClass = fhirClassAndBackboneElements[0];

                for fhirClassOrBackboneElement in fhirClassAndBackboneElements:

                    children = TranslationUtilities.getFHIRClassChildren(fhirClassOrBackboneElement, linkedClasses, fhirClassesRecurse, selectiveRecurse);

                    if ( children != None ): fhirClassesToChildren.setdefault(fhirClass, []).extend(children)

        else:

            if ( not mergeMainChildrenWithBackboneChildren ): fhirClasses = Utilities.mergeListOfLists(fhirClasses);

            for fhirClass in fhirClasses:

                children = TranslationUtilities.getFHIRClassChildren(fhirClass, linkedClasses, fhirClassesRecurse, selectiveRecurse);

                if ( children != None ): fhirClassesToChildren[fhirClass] = children;

        return fhirClassesToChildren;
Example No. 9
    def removeGerund(ehrChild):

        separatedElementTag = Utilities.listFromCapitals(ehrChild);

        if ( len(separatedElementTag) > 1 ):

            taggedSeperatedElementTag = nltk.pos_tag(separatedElementTag);
            separatedElementTag = [tag[0] for tag in taggedSeperatedElementTag if "VBG" not in tag[1]]
            return "".join(separatedElementTag);

        return ehrChild;
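
A rough usage sketch of the gerund removal above (the exact POS tags depend on the NLTK tagger and its trained model):

    import nltk

    words = ["Managing", "Organisation"]    # e.g. from Utilities.listFromCapitals("ManagingOrganisation")
    tagged = nltk.pos_tag(words)            # roughly [('Managing', 'VBG'), ('Organisation', 'NNP')]
    print("".join(w for w, t in tagged if "VBG" not in t))  # -> "Organisation"
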
Example No. 10
    def morphologicalSimilarity(ehrAttribute, fhirAttribute, lemmaSimilarityThreshold=TranslationConstants.MORPHOLOGICAL_SIMILARITY_THRESHOLD):

        if SimilarityMetrics.textMatch(ehrAttribute, fhirAttribute): return 1;

        highestSimilarity = 0;

        for lemma in Utilities.lemmas(ehrAttribute):

            if SimilarityMetrics.textSimilarity(lemma, fhirAttribute, True) > highestSimilarity and SimilarityMetrics.textMatch(lemma, fhirAttribute, True, lemmaSimilarityThreshold):
                highestSimilarity = SimilarityMetrics.textSimilarity(lemma, fhirAttribute, True);

        return highestSimilarity;
Example No. 11
    def compositeStringSimilarity(ehrClassField, fhirClassField, comparisonMethod, comparisonMethodArgs=[], highestResult=True, removeStopwords=True):

        if ( comparisonMethod(ehrClassField, fhirClassField, *comparisonMethodArgs) == 1 ): return 1;

        # If the ehrClass string is composite, compare each of its words with the FHIR target and combine the results: either return the highest single-word similarity, or sum each word's best match and divide by the number of words to give an average match across all words.
        highestSimilarity = 0;
        highestSimilarityWord = "";

        totalSimilarity = 0;

        ehrWords = Utilities.listFromCapitals(ehrClassField);
        fhirWords = Utilities.listFromCapitals(fhirClassField);

        if (removeStopwords): ehrWords = [word for word in ehrWords if word.lower() not in stopwords.words('english')];

        for ehrWord in ehrWords:

            highestSimilarityForEHRWord = 0;

            for fhirWord in fhirWords:

                similarity = comparisonMethod(ehrWord, fhirWord, *comparisonMethodArgs);

                if ( similarity > highestSimilarity ):

                    highestSimilarity = similarity;
                    highestSimilarityWord = ehrWord;

                if ( similarity > highestSimilarityForEHRWord ): highestSimilarityForEHRWord = similarity;

            totalSimilarity += highestSimilarityForEHRWord;

        if ( highestResult and len(highestSimilarityWord) > TranslationConstants.LENGTH_TO_IGNORE_IN_COMPOSITE_HIGHEST ):

            return highestSimilarity;

        else:

            return old_div(totalSimilarity, max(float(len(ehrWords)), float(len(fhirWords))));
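
Utilities.listFromCapitals is not included in this listing. A plausible sketch of a camel-case splitter with that behaviour (a guess for illustration, not the actual implementation):

    import re

    def listFromCapitals(text):
        # Split "DateOfBirth" into ["Date", "Of", "Birth"]; a single lowercase word is returned as-is.
        parts = re.findall(r'[A-Z][a-z0-9]*|[a-z0-9]+', text)
        return parts if parts else [text]
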
Example No. 12
    def __init__(self, params=list()):
        self.remove_urls = 'remove_urls' in params
        self.remove_mentions = 'remove_mentions' in params
        self.remove_hashtags = 'remove_hashtags' in params
        self.normalize = 'normalize' in params
        self.remove_stopwords = 'remove_stopwords' in params
        self.remove_punct = 'remove_punctuation' in params
        self.lower = 'lower' in params
        self.lemmatize = 'lemmatize' in params
        self.stemming = 'stemming' in params
        self.remove_non_letters = 'remove_non_letters' in params
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()
        self.utilities = Utilities()
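
A short usage sketch (the flag names come from the constructor above; the sample text is made up):

    preprocessor = Preprocessor(['remove_urls', 'lower', 'lemmatize'])
    clean_text = preprocessor.preprocess("Flooding reported near the river http://example.com")
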
Example No. 13
    def get_evaluation_data(self, dataset_file, n_pair):
        utilities = Utilities()

        preprocessor = Preprocessor(
            ['remove_stopwords', 'remove_non_letters', 'lemmatize'])

        data_rows = utilities.read_from_csv(dataset_file)
        del data_rows[0]

        X = []
        y = []
        for data_row in data_rows[:n_pair]:
            # data_row[2] holds the string form of a (cause, effect) tuple;
            # ast.literal_eval would be a safer alternative for untrusted input.
            candidate_causal_pair = eval(data_row[2])
            label = 1 if data_row[3] == 'causal' else 0

            candidate_causal_phrase = preprocessor.preprocess(
                candidate_causal_pair[0])
            candidate_effect_phrase = preprocessor.preprocess(
                candidate_causal_pair[1])
            if len(candidate_causal_phrase) > 0 and len(
                    candidate_effect_phrase) > 0:
                X.append((candidate_causal_pair[0], candidate_causal_pair[1]))
                y.append(label)
        return X, y
Example No. 14
    def distanceError():
        sum_dist = 0
        num = 0
        for part in Body25.Parts:
            coord1 = Body25.getCoordinates(keypoint1, part)
            coord2 = Body25.getCoordinates(keypoint2, part)
            dist = Ut.distance(coord1 / frame, coord2 / frame, ignore_zero_vector=True)
            if np.isnan(dist):
                continue
            else:
                sum_dist += dist
            num += 1
        if num == 0:
            return np.inf
        else:
            return sum_dist / num
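
Ut.distance with ignore_zero_vector=True is not shown here. One way such a helper might work, assuming an undetected keypoint is encoded as an all-zero coordinate and should be skipped by the caller:

    import numpy as np

    def distance(a, b, ignore_zero_vector=False):
        a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
        if ignore_zero_vector and (not a.any() or not b.any()):
            return np.nan  # missing keypoint; callers treat NaN as "skip this part"
        return float(np.linalg.norm(a - b))
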
Example No. 15
    def test_TestMorphological(self):

        # e.g. self.assertTrue(Matches.match("PostCode", "postalCode"));

        total = 0
        matched = 0
        for key, value in usToGB.items():
            lemmas = list(Utilities.lemmas(value))
            if (lemmas):
                total += 1
                shuffle(lemmas)
                if (Matches.matches(value, lemmas[0])):
                    matched += 1
                else:
                    print(str(value) + " " + str(lemmas[0]))
        matchPercentage = matched / float(total)
        self.assertTrue(matchPercentage > 0.90)
Example No. 16
    def run_script(browser='chrome'):
        try:
            driver = Utilities.create_webdriver_instance(browser=browser)
            driver.get(WEBSITE_URL)

            TaskFour.select_random_video_on_homepage(driver)
            TaskFour.wait_for_ad_to_complete(driver)
            TaskFour.move_progress_bar_to_some_position(driver)

        except Exception:
            # TODO: Remove broad exceptions
            # TODO: Add logging
            # TODO: Take screenshot
            traceback.print_exc()

        finally:
            driver.quit()
            print('Script Complete')
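
Utilities.create_webdriver_instance is not part of this listing. A minimal sketch using plain Selenium (browser handling and options are assumptions):

    from selenium import webdriver

    def create_webdriver_instance(browser='chrome'):
        if browser == 'firefox':
            return webdriver.Firefox()
        return webdriver.Chrome()  # assumes the matching driver binary is on PATH
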
Example No. 17
    def run_script(browser='chrome'):
        try:
            driver = Utilities.create_webdriver_instance(browser=browser)

            TaskTwo.search_for_flights(driver)
            TaskTwo.wait_for_flight_results_page_to_load(driver)
            TaskTwo.select_first_view_deal(driver)
            TaskTwo.select_second_view_deal(driver)

        except Exception:
            # TODO: Remove broad exceptions
            # TODO: Add logging
            # TODO: Take screenshot
            traceback.print_exc()

        finally:
            driver.quit()
            print('Script Complete')
Example No. 18
    def __data_timer_cb(self, timer):
        value = self.get_temperature()

        print("current temperature: {} ℃".format(value))

        try:
            self.__publish_data(value)
        except OSError as ose:
            err_msg = str(ose)

            if err_msg == "-1":
                pass
            elif err_msg == "[Errno 113] EHOSTUNREACH":
                Utilities.hard_reset()
            else:
                Utilities.log(self.__data_timer_cb, err_msg,
                              self.__log_callback)
        except Exception as e:
            err_msg = str(e)
            Utilities.log(self.__data_timer_cb, err_msg, self.__log_callback)
Example No. 19
    def __msg_timer_cb(self):
        while self.__starting:
            try:
                self.__mqtt_client.wait_msg()
            except OSError as ose:
                err_msg = str(ose)

                if err_msg == "-1":
                    pass
                elif err_msg == "[Errno 113] EHOSTUNREACH":
                    Utilities.hard_reset()
                else:
                    Utilities.log(self.__msg_timer_cb, err_msg,
                                  self.__log_callback)
                    # raise OSError(err_msg)
            except Exception as e:
                err_msg = str(e)
                Utilities.log(self.__msg_timer_cb, err_msg,
                              self.__log_callback)

            gc.collect()
Example No. 20
def do_job(articles, tokens, causal_net_generator):
    causal_pair_tokens = causal_net_generator.get_all_causal_pair_tokens(
        articles)

    tokens += causal_pair_tokens


if __name__ == '__main__':
    start = time.time()
    print("\nJob started at %s" %
          datetime.fromtimestamp(start).strftime('%Y-%m-%d %H:%M:%S'))
    causal_net_generator = CausalNetGenerator()
    causal_net_generator_from_news = CausalNetGeneratorFromNews()
    multi_word_causal_net_generator_from_news = MultiWordCausalNetGeneratorFromNews()
    utilities = Utilities()
    manager = Manager()

    ## Generate causal net from Wikipedia articles

    tokens = manager.list()
    num_threads = cpu_count() - 1
    number = 1000000
    offset = 0
    print("Number: %d and offset %d" % (number, offset))

    graph_path = 'causal_net.pickle'

    articles = causal_net_generator.get_articles(number=number, offset=offset)
    dispatch_jobs(articles, num_threads, tokens, causal_net_generator)
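
dispatch_jobs is referenced but not shown. Given the do_job signature above, a plausible sketch that splits the articles across worker processes (the chunking strategy is an assumption):

    from multiprocessing import Process

    def dispatch_jobs(articles, num_threads, tokens, causal_net_generator):
        chunk_size = max(1, len(articles) // num_threads)
        chunks = [articles[i:i + chunk_size] for i in range(0, len(articles), chunk_size)]
        jobs = [Process(target=do_job, args=(chunk, tokens, causal_net_generator))
                for chunk in chunks]
        for job in jobs:
            job.start()
        for job in jobs:
            job.join()
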
Example No. 21
    def semanticSimilarity(ehrAttribute, fhirAttribute, useDefinition=False, alsoUseMorphologicalSimilarity=False, morphologicalSimilarityThreshold=TranslationConstants.MORPHOLOGICAL_SIMILARITY_THRESHOLD, compositeSynonyms=False, highestResult=True):

        # If these attributes would already be associated via a direct text match, don't also re-evaluate their similarity via the synonym-based comparison below.
        if SimilarityMetrics.textMatch(ehrAttribute, fhirAttribute, False): return 0;

        highestSimilarity = 0;

        # wordnet requires word separation by underscore, whereas EHR XML responses (for TPP at least) use camelCase (this won't be an issue if used with composite string similarity, where only one word is used at a time).
        for set in wordnet.synsets(Utilities.capitalToSeparation(ehrAttribute)):

            synonyms = set.lemma_names();

            if useDefinition:

                setType = set.pos();
                associatedSynonyms = [];

                if ( set not in SimilarityMetrics.synsetToDefinitionTerms ):

                    # We also include words from the definition of this word, that are of the same grammatical type (e.g. noun or verb), as potential synonyms.
                    for word in set.definition().split(" "):

                        if ( len(word) <= 3 or word in associatedSynonyms or "." in word ): continue;

                        if ( word not in SimilarityMetrics.wordsToTypes ):

                            wordSynset = wordnet.synsets(word);

                            if not len(wordSynset): continue;

                            # Find most popular interpretation of this word, so can find right grammatical form.
                            chosenSynset = wordSynset[0];
                            highestLemmaPopularity = 0;

                            for set in wordSynset:

                                for lemma in set.lemmas():

                                    if lemma.count() > highestLemmaPopularity:
                                        highestLemmaPopularity = lemma.count();
                                        chosenSynset = set;

                            SimilarityMetrics.wordsToTypes[word] = chosenSynset.pos();

                        if ( SimilarityMetrics.wordsToTypes[word] == setType ):

                            associatedSynonyms.append(word);

                    SimilarityMetrics.synsetToDefinitionTerms[set] = associatedSynonyms;

                synonyms = synonyms + SimilarityMetrics.synsetToDefinitionTerms[set];

            for synonym in synonyms:

                # Do we want the highest value across all components of the synonym, or just the synonym directly?
                if ( compositeSynonyms ):

                    textSimilarity = SimilarityMetrics.compositeStringSimilarity(Utilities.separationToCapital(synonym), fhirAttribute, SimilarityMetrics.textSimilarity, [], highestResult);

                else:

                    textSimilarity = SimilarityMetrics.textSimilarity(Utilities.separationToCapital(synonym), fhirAttribute);

                # Synonyms may also be grammatical variants as opposed to just text matches.
                if ( alsoUseMorphologicalSimilarity ):

                    if ( compositeSynonyms ):

                        morphologicalSimilarity = SimilarityMetrics.compositeStringSimilarity(Utilities.separationToCapital(synonym), fhirAttribute, SimilarityMetrics.morphologicalSimilarity, [morphologicalSimilarityThreshold], highestResult);

                    else:

                        morphologicalSimilarity = SimilarityMetrics.morphologicalSimilarity(synonym, fhirAttribute);

                else:
                    morphologicalSimilarity = 0;

                # Get similarity between synonym for ehrAttribute and fhirAttribute (not synonyms that are the ehr attribute itself). If this is over a given threshold, AND it is greater than previously marked highest values, update highest similarity.
                if not SimilarityMetrics.textSimilarity(synonym, ehrAttribute) == 1.0 and max(textSimilarity, morphologicalSimilarity) > highestSimilarity:

                    highestSimilarity = max(textSimilarity, morphologicalSimilarity);

        return highestSimilarity;
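
To illustrate the WordNet lookup the method relies on (output varies with the installed WordNet data):

    from nltk.corpus import wordnet

    for synset in wordnet.synsets("doctor"):
        print(synset.pos(), synset.lemma_names())  # e.g. n ['doctor', 'doc', 'physician', 'MD', ...]
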
Example No. 22
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from nltk.corpus import wordnet

from utils.utilities import Utilities
from causality_detection.causal_stength_calculator import CausalStrengthCalculator
from preprocessing.preprocesssor import Preprocessor

if __name__ == '__main__':
    causal_strength_calculator = CausalStrengthCalculator()

    utilities = Utilities()
    preprocessor = Preprocessor(
        ['remove_stopwords', 'remove_non_letters', 'lemmatize'])
    dataset_file = 'causal_pairs_dataset_old.csv'

    data_rows = utilities.read_from_csv(dataset_file)
    del data_rows[0]
    X = []
    y_true = []
    y_pred = []
    threshold = 10

    for data_row in data_rows[:10]:
        candidate_causal_pair = eval(data_row[2])
        label = 1 if data_row[3] == 'causal' else 0

        candidate_causal_phrase = preprocessor.preprocess(
            candidate_causal_pair[0])
        candidate_effect_phrase = preprocessor.preprocess(
            candidate_causal_pair[1])
        if len(candidate_causal_phrase) > 0 and len(
Example No. 23
class EventExtractor:
    def __init__(self):
        self.data_file = app_config['data_file']
        self.texts_in_file = 'texts_in_file.txt'
        self.ner_texts_file = 'output.txt'
        self.utilities = Utilities()
        self.lemmatizer = WordNetLemmatizer()
        self.preprocessor = Preprocessor(
            ['remove_urls', 'remove_mentions', 'remove_hashtags', 'normalize'])

        # jar_files = os.path.join(os.path.dirname(__file__), 'jars')
        # self.sutime = SUTime(jars=jar_files, mark_time_ranges=True)

    def save_texts_in_file(self):
        items = self.utilities.read_from_csv(self.data_file)

        header = items[0]

        texts = [item[header.index('text')] for item in items[1:]]

        processed_texts = [
            self.preprocessor.preprocess(text).encode('utf8') for text in texts
        ]

        self.utilities.save_list_as_text_file(processed_texts,
                                              self.texts_in_file)

    def prepare_phrases(self,
                        matches,
                        tag,
                        token_position=0,
                        tag_position=-1,
                        splitter='/'):
        phrases = []
        phrase = ''
        for match in matches:
            match_components = match.split(splitter)
            text_token = match_components[token_position].lower().strip()
            event_tag = match_components[tag_position]

            if event_tag == 'B-' + tag and len(phrase) < 1:
                phrase += text_token
            elif event_tag == 'B-' + tag and len(phrase) > 0:
                phrases.append(phrase)
                phrase = text_token
            else:
                phrase += ' ' + text_token
        phrases.append(phrase)
        phrases = list(set(phrases))

        return phrases

    def get_event_phrases(self, text):
        tag_name = 'EVENT'
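        # Assumes tokens are formatted as token/chunk-tag/POS/event-tag,
        # e.g. "earthquake/O/NN/B-EVENT" (inferred from the regex below, not documented elsewhere).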
        matches = re.findall(r'\w+/O/[A-Z]+/[BI]-' + tag_name, text)
        phrases = self.prepare_phrases(matches, tag_name)
        joined_text = ', '.join(phrases) if len(phrases) > 0 else ''

        return joined_text

    def get_event_locations(self, text):
        tag_name = 'geo-loc'
        matches = re.findall(r'\w+/[BI]-' + tag_name + '/[A-Z]+/O', text)
        phrases = self.prepare_phrases(matches=matches,
                                       tag=tag_name,
                                       token_position=0,
                                       tag_position=1)
        joined_text = ', '.join(phrases) if len(phrases) > 0 else ''

        return joined_text

    def get_event_entities(self, text):
        tag_names = [
            'person', 'company', 'facility', 'product', 'band', 'sportsteam',
            'movie', 'tv-show'
        ]
        phrases = []
        for tag_name in tag_names:
            matches = re.findall(r'\w+/[BI]-' + tag_name + '/[A-Z]+/O', text)
            if len(matches) > 0:
                phrases += self.prepare_phrases(matches=matches,
                                                tag=tag_name,
                                                token_position=0,
                                                tag_position=1)

        joined_text = ', '.join(phrases) if len(phrases) > 0 else ''

        return joined_text

    def extract_events(self):
        data_rows = self.utilities.read_from_csv(self.data_file)
        text_rows = self.utilities.read_lines_from_file(self.ner_texts_file)

        header = data_rows[0]
        del data_rows[0]
        events = []
        unique_texts = []
        for data_row, text_row in zip(data_rows, text_rows):
            text = self.preprocessor.preprocess(data_row[header.index('text')])

            if text in unique_texts:
                continue

            event = {
                'tweet_id': data_row[header.index('id')],
                'entities': self.get_event_entities(text_row),
                'locations': self.get_event_locations(text_row),
                'event_time': data_row[header.index('created_at')],
                'event_phrases': self.get_event_phrases(text_row),
            }

            events.append(event)
            unique_texts.append(text)

        return events

    def extract_events_from_stanford_dependencies(self, dependencies,
                                                  ner_tags):
        entity_categories = ['PERSON', 'LOCATION', 'ORGANIZATION']
        raw_events = {}
        for dependency in dependencies:
            if len(dependency) == 3:
                head = dependency[0]
                relation = dependency[1]
                tail = dependency[2]

                if head[1].startswith('VB'):
                    event_keywords = list(raw_events.keys())
                    event_keyword = self.lemmatizer.lemmatize(
                        head[0].lower(), 'v')
                    if event_keyword not in event_keywords:
                        raw_events[event_keyword] = {}

                    if relation.endswith('subj'):
                        subject_pronoun = [
                            'i', 'you', 'he', 'she', 'we', 'they', 'who'
                        ]
                        subj_value = self.lemmatizer.lemmatize(tail[0].lower())

                        if tail[0].lower() in subject_pronoun:
                            subj_value = 'PERSON'
                        else:
                            for ner_tag in ner_tags:
                                if ner_tag[0] == tail[0] and ner_tag[
                                        1] in entity_categories:
                                    subj_value = ner_tag[1]
                        raw_events[event_keyword]['subj'] = subj_value

                    if relation == 'dobj':
                        objective_pronoun = [
                            'me', 'you', 'him', 'her', 'us', 'you', 'them'
                        ]
                        dobj_value = self.lemmatizer.lemmatize(tail[0].lower())

                        if tail[0].lower() in objective_pronoun:
                            dobj_value = 'PERSON'
                        else:
                            for ner_tag in ner_tags:
                                if ner_tag[0] == tail[0] and ner_tag[
                                        1] in entity_categories:
                                    dobj_value = ner_tag[1]

                        raw_events[event_keyword]['dobj'] = dobj_value

                    if relation == 'compound:prt':
                        raw_events[event_keyword]['prt'] = tail[0]

        events = []
        for verb in list(raw_events.keys()):
            event = raw_events[verb]
            if len(verb) < 2 or 'subj' not in list(event.keys()) or len(event['subj']) < 2 \
                    or 'dobj' not in list(event.keys()) or len(event['dobj']) < 2:
                continue

            event['keyword'] = verb
            events.append(event)

        return events

    def get_unique_tweets(self, n_rows=None):
        data_rows = self.utilities.read_from_csv(self.data_file)
        preprocessor = Preprocessor([
            'remove_urls', 'remove_mentions', 'remove_hashtags', 'normalize',
            'remove_non_letters'
        ])

        header = data_rows[0]
        del data_rows[0]
        tweet_rows = {}
        for data_row in data_rows:
            if n_rows is not None and len(tweet_rows) >= n_rows:
                break
            tweet = preprocessor.preprocess(data_row[header.index('text')])
            if tweet not in list(tweet_rows.keys()):
                tweet_rows[tweet] = data_row
        tweet_rows = [header] + list(tweet_rows.values())

        return tweet_rows

    def get_tweet_sentences(self, tweet_rows):
        header = tweet_rows[0]
        del tweet_rows[0]

        tweet_sentences = []
        for tweet_row in tweet_rows:
            created_at = tweet_row[header.index('created_at')]
            text = self.preprocessor.preprocess(
                tweet_row[header.index('text')])
            sentences = sent_tokenize(text)
            for sentence in sentences:
                if len(sentence) > 1:
                    tweet_sentences.append((created_at, sentence))

        return tweet_sentences

    def extract_events2(self, tweet_sentences):
        path_to_jar = 'lib/stanford_parser/stanford-parser.jar'
        path_to_models_jar = 'lib/stanford_parser/stanford-english-corenlp-2018-02-27-models.jar'
        path_to_ner_tagger = 'lib/stanford_ner/stanford-ner.jar'
        path_to_ner_model = 'lib/stanford_ner/english.all.3class.distsim.crf.ser.gz'

        sentence_preprocessor = Preprocessor(['remove_non_letters'])
        ner_tagger = StanfordNERTagger(path_to_ner_model, path_to_ner_tagger)
        dependency_parser = StanfordDependencyParser(
            path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

        events = []

        chunks = list(
            self.utilities.chunkify_list(data_list=tweet_sentences,
                                         items_per_chunk=1000))

        for chunk in chunks:
            created_ats = []
            sentences = []
            for chunk_item in chunk:
                created_ats.append(chunk_item[0])
                sentences.append(
                    sentence_preprocessor.preprocess(chunk_item[1]))

            chunk_sent_dependencies = dependency_parser.raw_parse_sents(
                sentences)
            chunk_sent_ner_tags = ner_tagger.tag_sents(
                [sentence.split() for sentence in sentences])

            for sent_dependencies, sent_ner_tags, created_at in zip(
                    chunk_sent_dependencies, chunk_sent_ner_tags, created_ats):
                dependencies = [
                    list(parse.triples()) for parse in sent_dependencies
                ]

                if len(dependencies) > 0 and dependencies[0] is not None:
                    sentence_events = self.extract_events_from_stanford_dependencies(
                        dependencies[0], sent_ner_tags)
                    if len(sentence_events) > 0:
                        for sentence_event in sentence_events:
                            events.append((created_at, sentence_event))

        return events

    def chunkify_events_by_timeslots(self, events, duration):
        slot_starts_at = None
        event_chunks = []
        event_chunk = []
        for event in events:
            created_at = datetime.strptime(event[0], '%d-%m-%Y %H:%M')

            if slot_starts_at is None:
                slot_starts_at = created_at

            if len(event_chunk) > 0 and created_at > slot_starts_at + timedelta(0, duration):
                event_chunks.append(event_chunk)
                event_chunk = []
                slot_starts_at = created_at
            event_chunk.append(event)
        event_chunks.append(event_chunk)
        return event_chunks
Example No. 24
import timeit
import collections

from utils.utilities import Utilities
from preprocessing.preprocesssor import Preprocessor
from causality_detection.causal_stength_calculator import CausalStrengthCalculator
from causality_detection.itemsest_causality import ItemsetCausality


if __name__ == "__main__":
    start_time = timeit.default_timer()
    event_file_path = 'events.csv'
    utilities = Utilities()
    causal_strength_calculator = CausalStrengthCalculator()
    itemset_causality = ItemsetCausality()
    preprocessor = Preprocessor(params=['lower', 'lemmatize'])

    rows = utilities.read_from_csv(event_file_path)
    header = rows[0]
    del rows[0]

    events_phrases = []
    for row in rows:
        phrases = [phrase.strip() for phrase in row[header.index('event_phrases')].split(',')]
        events_phrases += phrases

    sorted_event_phrases = collections.Counter(events_phrases).most_common()
    low_freq_events = [event[0] for event in sorted_event_phrases if event[1] <= 5]

    event_rows = []
    for row in rows:
Example No. 25
from utils.utilities import Utilities
from preprocessing.event_extractor import EventExtractor
import time

if __name__ == '__main__':
    # Original Code
    #
    # utilities = Utilities()
    # event_extraction = EventExtractor()
    # #
    # # event_extraction.save_texts_in_file()
    #
    # events = event_extraction.extract_events()
    #
    # for event in events:
    #     if len(event['event_phrases']) > 0:
    #         utilities.save_or_append_in_csv(event, 'events.csv')

    utilities = Utilities()
    event_extractor = EventExtractor()

    tweet_rows = event_extractor.get_unique_tweets()

    tweet_sentences = event_extractor.get_tweet_sentences(tweet_rows)

    events = event_extractor.extract_events2(tweet_sentences)
    events = sorted(events,
                    key=lambda x: time.strptime(x[0], '%d-%m-%Y %H:%M'))
    utilities.save_or_append_list_as_csv(events, 'events2.csv')
Example No. 26
    def __init__(self):
        self.data_source_file = None
        self.utilities = Utilities()
Example No. 27
class BackgroundDataCollection:

    def __init__(self):
        self.data_source_file = None
        self.utilities = Utilities()

    def set_data_source_file(self, source_file):
        self.data_source_file = source_file

    def remove_out_of_range_historic_urls(self, urls, date_from, date_to):
        """
        Remove out of range urls

        :param urls: list of urls
        :param date_from: date from
        :param date_to: date to
        :return: list of in the range urls
        """
        in_range_urls = []

        try:
            date_from = parser.parse(str(date_from))
            date_to = parser.parse(str(date_to))
        except ValueError:
            raise Exception("Invalid date range. Please input date in yyyymmdd format")

        for url in urls:
            if len(url) > 43:
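                # Wayback Machine snapshot URLs look like
                # "https://web.archive.org/web/20180101000000/<original-url>",
                # so indices 28:42 hold the 14-digit snapshot timestamp parsed below.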
                date_str = url[28:42]

                url_time = parser.parse(date_str)

                if date_from <= url_time <= date_to:
                    in_range_urls.append(url)

        return in_range_urls

    def collect_data(self, date_from, date_to):
        """
        Run the whole workflow for historical article collection within a range

        :param date_from: date from
        :param date_to: date to
        :return: list of articles
        """

        # os.makedirs(self.articles_base_dir, exist_ok=True)
        try:
            parser.parse(str(date_from))
            parser.parse(str(date_to))
        except ValueError:
            print("Invalid date format. Please provide date in yyyymmdd format.")
            return

        source_urls = self.utilities.read_lines_from_file(self.data_source_file)
        new_file_count = 0
        for source_url in source_urls:

            url_str = str(subprocess.run(
                ['waybackpack', source_url, '--list', '--from-date', str(date_from), '--to-date', str(date_to)],
                stdout=subprocess.PIPE).stdout.decode('utf-8'))
            urls = url_str.splitlines()

            print(urls)
            exit()
Example No. 28
    def _OnWebSocketTextMsg(webSocket, msg):
        global Utilities, WifiHandler, Config
        import ujson

        print('WebSocket text message: %s' % msg)
        # webSocket.SendTextMessage('Received "%s"' % msg)

        try:
            params = ujson.loads(msg)

            if params["command"] == "identity":
                from utils.json_const import identity_result

                identity_result.update(
                    hardware_version=Config.HARDWARE_VERSION,
                    hardware_name=Config.HARDWARE_NAME,
                    mac_address=WifiHandler.get_mac_address(),
                    ip_address=WifiHandler.get_ip_address())

                webSocket.SendTextMessage(ujson.dumps(identity_result))
            elif params["command"] == "save_settings":
                from utils.json_const import save_settings_result_success, save_settings_result_failed
                from utils.settings_template import template

                settings = template.format(**params)
                # print(settings)
                with open("settings.py", "w") as file:
                    length = file.write(settings)

                    if length == len(settings):
                        webSocket.SendTextMessage(
                            ujson.dumps(save_settings_result_success))
                    else:
                        webSocket.SendTextMessage(
                            ujson.dumps(save_settings_result_failed))
            elif params["command"] == "reboot_device":
                Utilities.hard_reset()
            elif params["command"] == "check_wifi":
                from utils.json_const import check_wifi_result

                result_code = WifiHandler.set_sta_mode(params["wifi_ssid"],
                                                       params["wifi_password"],
                                                       timeout_sec=60,
                                                       for_test=True)

                check_wifi_result.update(result_code=result_code)

                webSocket.SendTextMessage(ujson.dumps(check_wifi_result))

                if result_code == WifiHandler.STATION_CONNECTED:
                    import urequests
                    from utils.json_const import check_internet_result_success, check_internet_result_failed

                    try:
                        res = urequests.get(Config.INTERNET_TESTING_URL,
                                            timeout=10.0)

                        if res:
                            if res.text == "Success":
                                webSocket.SendTextMessage(
                                    ujson.dumps(check_internet_result_success))
                            else:
                                webSocket.SendTextMessage(
                                    ujson.dumps(check_internet_result_failed))
                        else:
                            webSocket.SendTextMessage(
                                ujson.dumps(check_internet_result_failed))
                    except Exception:
                        webSocket.SendTextMessage(
                            ujson.dumps(check_internet_result_failed))
            elif params["command"] == "check_mqtt":
                from umqtt.simple import MQTTClient
                from utils.json_const import check_mqtt_result_success, check_mqtt_result_failed

                def sub_cb(topic, msg):
                    pass

                mqtt_client = MQTTClient(params["client_id"], params["host"],
                                         int(params["port"]),
                                         params["username"],
                                         params["password"],
                                         int(params["keepalive"]))

                try:
                    username = params["bigiot_username"] if bool(
                        params["is_bigiot"]) else params["client_id"]

                    mqtt_client.set_callback(sub_cb)
                    print("check_mqtt_result:", mqtt_client.connect(True))
                    print(
                        "test subscribe:",
                        mqtt_client.subscribe(
                            "{}/data".format(username).encode()))
                    print(
                        "test publish:",
                        mqtt_client.publish(
                            "{}/data".format(username).encode(), "world"))
                    mqtt_client.disconnect()

                    webSocket.SendTextMessage(
                        ujson.dumps(check_mqtt_result_success))
                except Exception as e:
                    print(str(e))

                    # error 5: authorization failed, meaning the device number or device authorization is wrong
                    if str(e) == "5":
                        check_mqtt_result_failed.update(
                            error_code="5",
                            error_msg=
                            "Authorized failed, check Username and Password")
                    # error 128: subscribe failed, meaning the client_id or topic authorization (username/data) is wrong
                    elif str(e) == "128":
                        check_mqtt_result_failed.update(
                            error_code="128",
                            error_msg=
                            "Subscribe failed, check Bigiot Username and Client ID"
                        )
                    else:
                        check_mqtt_result_failed.update(
                            error_code=str(e),
                            error_msg="Unknown error: {}".format(str(e)))

                    webSocket.SendTextMessage(
                        ujson.dumps(check_mqtt_result_failed))
        except ValueError:
            webSocket.SendTextMessage("Params Format Error")

        gc.collect()
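
For reference, the handler above parses JSON text messages keyed by "command"; a few illustrative payloads (the values are made up, only the keys appear in the code):

    {"command": "identity"}
    {"command": "check_wifi", "wifi_ssid": "MyNetwork", "wifi_password": "secret"}
    {"command": "reboot_device"}
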
Example No. 29
from utils.utilities import Utilities
from database.db_operations import DbOperations
from required_files.config import app_config

if __name__ == '__main__':

    db_operations = DbOperations()
    utilities = Utilities()

    items = db_operations.get_all_item()
    data_file = app_config['data_file']
    for item in items:
        utilities.save_or_append_in_csv(item, data_file)
Example No. 30
    def __sub_cb(self, topic, msg):
        if topic != self._topic:
            return

        print("msg: {}".format(msg))

        try:
            json_obj = json.loads(str(msg, "utf-8"))
            command = json_obj['command']
            general_result = {
                'command': command + '_result',
                'mac_address': json_obj['mac_address'],
                'result': 'success'
            }

            if command == "wake_up_pc":
                for count in range(3):
                    wake_on_lan(json_obj['mac_address'])

                general_result['title'] = json_obj['title']
                general_result['mac_address'] = WifiHandler.get_mac_address()
                self._client.publish(topic, json.dumps(general_result))
            elif command == 'device_remove':
                if json_obj['mac_address'] != WifiHandler.get_mac_address():
                    return

                general_result['title'] = json_obj['title']
                self._client.publish(topic, json.dumps(general_result))

                Utilities.del_settings_file()
                Utilities.hard_reset()
            elif command == 'sync_datetime':
                if json_obj['mac_address'] != WifiHandler.get_mac_address():
                    return

                datetime = json_obj['datetime']
                RTC().datetime((
                    datetime['year'],
                    datetime['month'],
                    datetime['day'],
                    datetime['weekday'],  # 0~6
                    datetime['hour'],
                    datetime['minute'],
                    datetime['second'],
                    datetime['millisecond']))

                self._client.publish(topic, json.dumps(general_result))

                print("datetime: %02d-%02d-%02d %02d:%02d:%02d" %
                      ((localtime()[:-2])))
            elif command == 'device_reboot':
                if json_obj['mac_address'] != WifiHandler.get_mac_address():
                    return

                Utilities.hard_reset()
            elif command == 'report_error_log':
                if json_obj['mac_address'] != WifiHandler.get_mac_address():
                    return

                general_result['logs'] = Utilities.read_logs()
                self._client.publish(topic, json.dumps(general_result))
        except ValueError:
            pass
        except KeyError as ke:
            print("KeyError:", ke)

        gc.collect()
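
An illustrative sync_datetime message this callback would accept (values are made up; the keys follow the code above):

    {"command": "sync_datetime",
     "mac_address": "aa:bb:cc:dd:ee:ff",
     "datetime": {"year": 2021, "month": 5, "day": 1, "weekday": 5,
                  "hour": 12, "minute": 0, "second": 0, "millisecond": 0}}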