Example #1
 def count_word(self):
     """
     Count stop words and output statistics.
     """
     word_list = []
     result = ETL().extract_word()
     count, total = 0, len(result)
     for row in result:
         count += 1
         test_id = row[0]
         print(f"{test_id}, {count}/{total}")
         try:
             dump = ETL().extract_cdb(test_id)
             processed = Process(dump).internal_process()
         except (IndexError, UnicodeDecodeError):
             continue
         if "\n\n" in dump:
             exceptions = dump[dump.index("\n\n") + len("\n\n"):]
             try:
                 header = "exception throw location:\n"
                 stack = exceptions[exceptions.index(header) + len(header):]
             except ValueError:
                 continue
             # extract root cause from exceptions
             if dump.count(header) > 1:
                 stack = stack[:stack.index("\n\n")]
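             # each stack line looks like "<n>: <text> at <location>"; capture the text between the frame number and "at"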
             roots = re.findall(r"^\d+:[ ](.+)[ ]at[ ].+", stack, re.M)
             words = self.obtain_word(roots, processed)
             word_list += words
     Log().chart_print(Counter(word_list).most_common(10))
Example #2
def enrich(plugins, uri, wait=0):

    if wait:
        time.sleep(wait)

    etl = ETL()
    etl.read_configfile('/etc/opensemanticsearch/etl')
    etl.read_configfile('/etc/opensemanticsearch/enhancer-rdf')

    etl.config['plugins'] = plugins.split(',')

    filename = uri

    # if present, strip the protocol prefix file://
    if filename.startswith("file://"):
        filename = filename.replace("file://", '', 1)

    parameters = etl.config.copy()

    parameters['id'] = uri
    parameters['filename'] = filename

    parameters, data = etl.process(parameters=parameters, data={})

    return data
Example #3
    def export_row_data_to_index(self, data, rownumber):

        parameters = self.config.copy()

        # todo: use all configured content plugins, not only this one
        parameters['plugins'] = [
            'enhance_path',
        ]

        etl = ETL()

        try:

            etl.process(parameters=parameters, data=data)

        # if the exception is a keyboard interrupt by the user, respect this and abort
        except KeyboardInterrupt:
            raise KeyboardInterrupt
        except BaseException as e:
            sys.stderr.write("Exception adding CSV row {} : {}".format(
                rownumber, e))

            if 'raise_pluginexception' in self.config:
                if self.config['raise_pluginexception']:
                    raise e
Example #4
 def update_mapping_new_product_id(self):
     logging.info("Create new product id after pruning dataset")
     _r_map = self.dataset[['PRODUCT_ID', 'NEW_PRODUCT_ID']]
     mm = [_r_map['PRODUCT_ID'].unique(), _r_map['NEW_PRODUCT_ID'].unique()]
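     # pair the old and new ids positionally (assumes both unique() arrays are aligned and equally long)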
     mapping = pd.DataFrame(data=np.array(mm).T,
                            columns=["product_id", "new_product_id"])
     logging.info("Save new product id into db")
     ETL(DATA_PATH, CONFIG_PATH,
         SCHEMA_PATH).insert_new_product_id_table(mapping)
Example #5
def main():
    etl = ETL()
    etl.extract()
    etl.transform()
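    # cluster the transformed observations on the 'modeling_text' field: vectorize, reduce with LSA, fit k-means, report metrics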
    kmeans_model = KMeansModel(etl.observations, 'modeling_text',
                               n_clusters=3,
                               n_features=1000)
    kmeans_model.vectorize()
    kmeans_model.apply_lsa(n_components=50)
    kmeans_model.run()
    kmeans_model.get_metrics()
Example #6
 def setUp(self):
     self.etl = ETL()
     data = {
         'customer_id': [1, 1],
         'order_id': [1, 2],
         'order_item_id': [5, 6],
         'num_items': [2, 3],
         'revenue': [90, 50],
         'created_at_date': [datetime(2017, 10, 5),
                             datetime(2017, 10, 12)]
     }
     self.test_df = pd.DataFrame.from_dict(data)
Example #7
def ETI_TEST():
    print("Staring ETL Test  Job !!!!")
    var = Variable()
    var.INPUT_DATA = "/input/employee.csv"
    csvParser = csv_parser(var.INPUT_DATA)

    if csvParser.file_exist(var.INPUT_DATA):
        TABLE_NAME = csvParser.get_table_name(
            var.INPUT_DATA)  # Takes Table name from Filename

        if var.CREATE_TABLE and not var.RELATION:
            ob = ETL(TABLE_NAME, var.INPUT_DATA)
            ob.etl_process(csvParser.check_header(), 20)
Example #8
def main():

    print("Staring ETL Job !!!!")
    var = Variable()
    csvParser = csv_parser(var.INPUT_DATA)

    if csvParser.file_exist(var.INPUT_DATA):
        TABLE_NAME = csvParser.get_table_name(
            var.INPUT_DATA)  # Takes Table name from Filename

        if var.CREATE_TABLE and not var.RELATION:
            ob = ETL(TABLE_NAME, var.INPUT_DATA)
            ob.etl_process(csvParser.check_header())
Example #9
def main():
    parser = argparse.ArgumentParser(
        description='CLI for the pldb application.')
    parser.add_argument(
        '--update',
        action='store_true',
        help='Download season data and update the JSON file and the database.')
    parser.add_argument(
        '--table',
        action='store_true',
        help=
        'Display the current standings table (calculated from data in the database).'
    )
    parser.add_argument(
        '--club',
        type=str,
        default=None,
        help=
        'Display info for all the matches for the given club in the season.')
    args = parser.parse_args()

    if args.update:
        print("Updating season data...")
        etl = ETL()
        etl.run()
        print("done.")
    elif args.table:
        query = Query()
        table_data = query.table()
        print("#\tClub\tPlayed\tWon\tDrawn\tLost\tGD\tPoints")
        for rank in range(len(table_data)):
            row = table_data[rank]
            print(
                f"{rank + 1}\t{row['club']}\t{row['matches_played']}\t"
                f"{row['wins']}\t{row['draws']}\t{row['losses']}\t{row['goal_diff']}\t"
                f"{row['points']}")
    elif args.club:
        query = Query()
        for match in query.club(args.club):
            kick_time = match['kickoff'] / 1000
            kick_time = datetime.datetime.fromtimestamp(kick_time).strftime(
                "%a %d %b %H:%M")
            if match['status'] == 'C':
                score = f"{match['away_goals']} {match['home_goals']}"
            else:
                score = ' @ '
            print(
                f"{kick_time} {match['away_club']['abbr']} "
                f"{score} {match['home_club']['abbr']} {match['ground']['name']}"
            )
Example #10
def main():
    """The `main` function, to be called from commandline, like `python src/main.py -c etl.cfg`.

    Args:
       -c  --config <config_file>  the Stetl config file.
       -s  --section <section_name> the section in the Stetl config (ini) file to execute (default is [etl]).
       -a  --args <arglist> substitutable args for symbolic, {arg}, values in Stetl config file, in format "arg1=foo arg2=bar" etc.

    """
    args = parse_args()

    # Do the ETL
    etl = ETL(vars(args), args.config_args)
    etl.run()
Example #11
 def __init__(self, pruning_method=None):
     self.c_users = pd.DataFrame(
         data=ETL(DATA_PATH, CONFIG_PATH,
                  SCHEMA_PATH).select_complaints_users_from_db(),
         columns=['COMPLAINT_ID', 'COMPLAINT_TEXT', 'PRODUCT_ID'])
     p_id = self.pruning_product_list(pruning_method=pruning_method)
     self.dataset = self.pruning_data_set(p_id)
     self.update_mapping_new_product_id()
     texts = self.clean_text()
     self.nnds = NNetDS()
     self.prepare_tf_data_set(texts)
     self.nnds.embedding_matrix = EmbeddingGlove(
         MAX_WORDS=MAX_WORDS,
         MAX_SEQ_LENGTH=MAX_SEQ_LENGTH,
         GLOV_EMBEDDING_DIM=GLOV_EMBEDDING_DIM,
         word_index=self.nnds.word_index).get_matrix()
     self.build_tf_model()
Example #12
    def __init__(self):
        dirname = os.path.dirname(__file__)

        # Script configuration
        os.environ['FABRIC_URL'] = 'http://localhost:3000/'
        os.environ['STORAGE_DIR'] = os.path.join(dirname, 'tmp')
        os.environ['USER_LIST'] = os.path.join(dirname, 'data/user.csv')
        os.environ['USER_STORAGE'] = os.path.join(os.getenv('STORAGE_DIR'),
                                                  'user.csv')
        os.environ['HOLIDAY_CALENDAR'] = os.path.join(
            dirname, 'data/thHoliday2563-64.csv')

        # Import Grouped command
        self.user = User()
        self.fabric = Fabric()
        self.service = Service()
        self.bid = Bid()
        self.etl = ETL()
Example #13
    def __init__(self, name, question, db_table, row_handler, answer_cql):
        self._name = name
        self._question = question
        self._db_table = db_table
        self._answer_cql = answer_cql
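        # ETL arguments below: job name, source csv path(s), target config, target column names, row handler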
        self._etl = ETL(
            f"{name} ETL",
            [helper.join_path(TEMP_DIR, 'staging.csv')],
            {
                "target": 'Cassandra',
                "table": db_table["table_name"],
                "is_file": False
            },
            [i[0] for i in db_table["cols"]],
            row_handler,
        )

        logger.info(f"{self._name} - Question: {self._question}")
Example #14
def main():
    """The `main` function, to be called from commandline, like `python src/main.py -c etl.cfg`.

    Args:
       -c  --config <config_file>  the Stetl config file.
       -s  --section <section_name> the section in the Stetl config (ini) file to execute (default is [etl]).
       -a  --args <arglist> substitutable args for symbolic, {arg}, values in Stetl config file, in format "arg1=foo arg2=bar" etc.
       -d  --doc <class> Get component documentation like its configuration parameters, e.g. stetl --doc stetl.inputs.fileinput.FileInput
       -h  --help get help info

    """
    args = parse_args()

    if args.config_file:
        # Do the ETL
        etl = ETL(vars(args), args.config_args)
        etl.run()

    elif args.doc_args:
        print_doc(args.doc_args)
    else:
        print('Unknown option, try stetl -h for help')
Example #15
    def test_blacklist(self):

        etl = ETL()
        etl.config['plugins'] = [
            'enhance_entity_linking', 'enhance_extract_law'
        ]
        etl.config['raise_pluginexception'] = True
        data = {}
        data['content_txt'] = "\n".join(["No clause for law code alias CC"])

        parameters, data = etl.process(
            parameters={'id': 'test_enhance_extract_law'}, data=data)

        self.assertFalse('Swiss Civil Code' in data['law_code_ss'])

        data['content_txt'] = "\n".join([
            "No clause for blacklisted law code alias CC but not blacklisted label of this alias: Swiss Civil Code"
        ])

        parameters, data = etl.process(
            parameters={'id': 'test_enhance_extract_law'}, data=data)

        self.assertTrue('Swiss Civil Code' in data['law_code_ss'])
Example #16
 def detect_sim(self):
     """
     Detect crash dump similarity and output the comparison result.
     """
     message = []
     order_pair, block_pair = [], []
     for param in self.params:
         # parameter is test_id
         if re.match(r"^\d{9,}$", param):
             dump = ETL().extract_cdb(param)
             processed = Process(dump).internal_process()
         # parameter is dump_path
         else:
             with open(param, "r", encoding="utf-8") as fp:
                 dump = fp.read()
             processed = Process(dump).pre_process()
         cpnt_order, func_block = Knowledge(processed).add_knowledge()
         message.extend([cpnt_order, func_block])
         order_pair.append(cpnt_order)
         block_pair.append(func_block)
     # output dump comparison
     Log().dump_print(message)
     Calculate(order_pair, block_pair).calculate_sim(debug=True)
Example #17
    def test(self):

        etl = ETL()
        etl.config['plugins'] = [
            'enhance_entity_linking', 'enhance_extract_law'
        ]
        etl.config['raise_pluginexception'] = True
        data = {}
        data['content_txt'] = "\n".join([
            "abc § 888 xyz"
            "abc § 987 b xyz"
            "§12", "§ 123", "§345a", "§456 b", "§ 567 c",
            "BGB § 153 Abs. 1 Satz 2",
            "§ 52 Absatz 1 Nummer 2 Buchstabe c STGB", "§ 444 CC"
        ])

        # run ETL of test.pdf with configured plugins and PDF OCR (result of etl_file.py)
        parameters, data = etl.process(
            parameters={'id': 'test_enhance_extract_law'}, data=data)

        self.assertTrue('§ 888' in data['law_clause_ss'])
        self.assertTrue('§ 987 b' in data['law_clause_ss'])
        self.assertTrue('§ 12' in data['law_clause_ss'])
        self.assertTrue('§ 123' in data['law_clause_ss'])
        self.assertTrue('§ 345a' in data['law_clause_ss'])
        self.assertTrue('§ 456 b' in data['law_clause_ss'])
        self.assertTrue('§ 567 c' in data['law_clause_ss'])

        self.assertTrue('§ 153 Abs. 1 Satz 2' in data['law_clause_ss'])
        self.assertTrue(
            '§ 52 Absatz 1 Nummer 2 Buchstabe c' in data['law_clause_ss'])

        self.assertTrue('Strafgesetzbuch' in data['law_code_ss'])
        self.assertTrue('Bürgerliches Gesetzbuch' in data['law_code_ss'])

        self.assertTrue('Swiss Civil Code' in data['law_code_ss'])
Example #18
 def __init__(self):
     self.etl = ETL()
Example #19
#!/usr/bin/python
# -*- coding: utf-8 -*-

from etl import ETL

if __name__ == "__main__":
    etl = ETL()

    # monthly
    etl.check_monthly_ranking()

    etl.check_new_actress()

    # daily
    etl.check_new_works()
Example #20
 def get_product_name(self, text):
     array_text = self.clean_text(text)
     new_id = self.get_new_product_id(array_text)
     return ETL(DATA_PATH, CONFIG_PATH,
                SCHEMA_PATH).select_product_name(new_id)
Example #21
 def test_init(self):
     dl = ETL()
Example #22
    def process(self, parameters=None, data=None):
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        verbose = False
        if 'verbose' in parameters:
            if parameters['verbose']:
                verbose = True

        if 'id' in data:
            docid = data['id']
        else:
            docid = parameters['id']

        # default classifier
        classifier = 'en_core_web_sm'

        if 'spacy_ner_classifier_default' in parameters:
            classifier = parameters['spacy_ner_classifier_default']

        # set language specific classifier, if configured and document language detected
        if 'spacy_ner_classifiers' in parameters and 'language_s' in data:
            # is a language specific classifier there for the detected language?
            if data['language_s'] in parameters['spacy_ner_classifiers']:
                classifier = parameters['spacy_ner_classifiers'][
                    data['language_s']]

        # fields whose text is analysed
        analyse_fields = ['content_txt', 'ocr_t', 'ocr_descew_t']

        text = ''
        for field in analyse_fields:
            if field in data:
                text = "{}{}\n".format(text, data[field])

        # extract sentences from text
        url = "http://localhost:8080/sents"
        if os.getenv('OPEN_SEMANTIC_ETL_SPACY_SERVER'):
            url = os.getenv('OPEN_SEMANTIC_ETL_SPACY_SERVER') + '/sents'

        headers = {'content-type': 'application/json'}
        d = {'text': text, 'model': classifier}

        response = requests.post(url, data=json.dumps(d), headers=headers)
        sentences = response.json()

        etl = ETL()

        sentencenumber = 0
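        # index each sentence as its own sub-document, linked back to the parent document via container_s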

        for sentence in sentences:

            sentencenumber += 1

            partdocid = docid + '#sentence' + str(sentencenumber)

            partparameters = parameters.copy()
            partparameters['plugins'] = [
                'enhance_path', 'enhance_detect_language_tika_server',
                'enhance_entity_linking', 'enhance_multilingual'
            ]

            if 'enhance_ner_spacy' in parameters['plugins']:
                partparameters['plugins'].append('enhance_ner_spacy')
            if 'enhance_ner_stanford' in parameters['plugins']:
                partparameters['plugins'].append('enhance_ner_stanford')

            sentencedata = {}
            sentencedata['id'] = partdocid

            sentencedata['container_s'] = docid

            if 'author_ss' in data:
                sentencedata['author_ss'] = data['author_ss']

            sentencedata['content_type_group_ss'] = "Sentence"
            sentencedata['content_type_ss'] = "Sentence"
            sentencedata['content_txt'] = sentence

            # index sentence
            try:
                partparameters, sentencedata = etl.process(
                    partparameters, sentencedata)

            except BaseException as e:
                sys.stderr.write("Exception adding sentence {} : {}".format(
                    sentencenumber, e))

        data['sentences_i'] = sentencenumber

        return parameters, data
Example #23
    def fill_na_median(self, df):
        # note: the original snippet starts mid-docstring; the signature and summary line are reconstructed and the method name is hypothetical
        '''Fill NaN values in non-categorical columns with the column median.

        Args:
            df (object): Pandas DataFrame

        Returns:
            df (object): Pandas DataFrame with median filled NaNs
        '''
        categoricals = []
        for col in list(df.columns):
            if df[col].dtype.name in ['object', 'category']:
                categoricals.append(col)

        for col in list(df.columns):
            if col not in categoricals:
                df[col] = df[col].fillna(df[col].median())
        return df


if __name__ == "__main__":
    path = '../data/'
    d = Data(path)
    d.get_data()
    df = d.clean_data()
    fe = Feature_Extractor()
    training_data = fe.extract_features(df, verbose=True)
    utils.save_csv(training_data, path, 'training_data')
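    # hand the engineered features to the ETL pipeline for the "training_data" table (connection, data_path, schema_path and engine are defined elsewhere)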
    etl = ETL(connection, data_path, schema_path, engine,
              df_to_write=training_data, table_name="training_data",
              remove=False, create=False, load=False, verbose=True)
    etl.pipeline()
Example #24
    def etl_graph(self, parameters):

        # Print infos
        if self.verbose:
            print("Graph has {} triples.".format(len(self.graph)))

        count = 0

        part_parameters = {}

        part_parameters['plugins'] = []

        # todo like enhance_path for properties & subjects?
        # abstract variable of enhance_path plugin?

        part_parameters['export'] = parameters['export']

        # since there can be multiple triples/values for same property,
        # do not overwrite but add value to existent values of the facet/field/property
        part_parameters['add'] = True

        for subj, pred, obj in self.graph:

            part_data = {}
            part_data['content_type'] = 'Knowledge graph'

            count += 1

            if self.verbose:
                print("Importing triple {}".format(count))

            try:

                # subject as URI/ID
                part_parameters['id'] = subj

                if self.verbose:
                    print("ID (RDF subject): {}".format(subj))

                #
                # Predicate/property to facet/field
                #
                rdf_property = pred

                # set Solr datatype so facets not available yet in Solr schema can be inserted automatically (dynamic fields) with right datatype
                facet = rdf_property + '_ss'

                if self.verbose:
                    print("Facet: {}".format(facet))
                #
                # object to facet/field value
                #

                value = self.get_labels_from_rdfobject(obj)

                # insert or append value (object of triple) to data
                part_data[facet] = value

                #
                # Property statistics
                #

                # add to facet property where you can see which properties are available
                part_data['property_ss'] = pred

                # todo: set parameter to add instead of update for multiple triples/values for/with same property

                etl = ETL()

                etl.verbose = self.verbose

                # index triple
                etl.process(part_parameters, part_data)

            except KeyboardInterrupt:
                raise KeyboardInterrupt

            except BaseException as e:
                sys.stderr.write("Exception while triple {}: {}\n".format(
                    count, e))
Example #25
	def etl_graph(self, parameters):

		if self.verbose:
			print("Graph has {} triples.".format(len(self.graph)) )
	
		count_triple = 0
		count_subjects = 0
	
		part_parameters = {}
		part_parameters['plugins'] = []
		part_parameters['export'] = parameters['export']
						
		property2facet = {}
		if 'property2facet' in parameters:
			property2facet = parameters['property2facet']

		etl_processor = ETL()
		etl_processor.verbose = self.verbose
		
		class_properties = []
		class_properties.append(rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'))
		class_properties.append(rdflib.term.URIRef(u'http://www.wikidata.org/prop/direct/P31'))
		# since there can be multiple triples/values for same property,
		# do not overwrite document but add value to existent document & values of the facet/field/property
		part_parameters['add'] = True
		# but not for the field content_type which doesn't change and is not multi valued
		part_parameters['fields_set'] = "content_type"

		# use SPARQL query with distinct to get subjects only once
		res = self.graph.query(
			"""SELECT DISTINCT ?subject
			WHERE {
			?subject ?predicate ?object .
			}""")
	
		for row in res:

			count_subjects += 1
	
			if self.verbose:
				print( "Importing entity / subject {}".format(count_subjects) )

			# get subject of the concept from first column
			subj = row[0]

			if self.verbose:
				print ( "Processing RDF subject {}".format(subj) )

			part_data = {}
			part_data['content_type'] = 'Knowledge graph'
			part_data['content_type_group'] = 'Knowledge graph'
			# subject as URI/ID
			part_parameters['id'] = subj
			
			preferred_label = self.get_preferred_label(subject=subj)
			part_data['title'] = preferred_label
			
			count_subject_triple = 0

			# get all triples for this subject
			for pred, obj in self.graph.predicate_objects(subject=subj):

				count_triple += 1
				count_subject_triple += 1

				if self.verbose:
					print( "Importing subjects triple {}".format(count_subject_triple) )
					print( "Predicate / property: {}".format(pred) )
					print( "Object / value: {}".format(obj) )


				try:
					
					# if class add preferredlabel of this entity to facet of its class (RDF rdf:type or Wikidata "instance of" (Property:P31)),
					# so its name (label) will be available in entities view and as filter for faceted search
					
					if pred in class_properties:
						class_facet = str(obj)
						# map class to facet, if mapping for class exist
						if class_facet in property2facet:
							class_facet = property2facet[class_facet]
						etl.append(data=part_data, facet=class_facet, values=preferred_label)			

					#
					# Predicate/property to facet/field
					#

					# set Solr datatype strings so facets not available yet in Solr schema can be inserted automatically (dynamic fields) with right datatype
					
					facet = pred + '_ss'
					facet_uri = facet + '_uri_ss'
					facet_preferred_label_and_uri = facet + '_preflabel_and_uri_ss'
					
					if self.verbose:
						print ( "Facet: {}".format(facet) )

	
					#
					# get values or labels of this object
					#

					values = self.get_values(obj=obj)
					if self.verbose:
						print ( "Values: {}".format(values) )

					# insert or append value (object of triple) to data
					etl.append(data=part_data, facet=facet, values=values)
					

					# if object is reference/URI append URI
					if type(obj) == rdflib.URIRef:
						
						uri = obj
						
						etl.append( data=part_data, facet=facet_uri, values=uri )

						# append mixed field with preferred label and URI of the object for disambiguation of different Entities/IDs/URIs with same names/labels in faceted search
						preferredlabel_and_uri = "{} <{}>".format ( self.get_preferred_label(subject=obj), obj)

					else:
						preferredlabel_and_uri = self.get_preferred_label(subject=obj)
					
					etl.append(data=part_data, facet=facet_preferred_label_and_uri, values=preferredlabel_and_uri)


				except KeyboardInterrupt:
					raise KeyboardInterrupt
	
				except BaseException as e:
					sys.stderr.write( "Exception while triple {} of subject {}: {}\n".format(count_subject_triple, subj, e) )
	
	
			# index triple
			etl_processor.process( part_parameters, part_data)
Example #26
    def process(self, parameters=None, data=None):
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        verbose = False
        if 'verbose' in parameters:
            if parameters['verbose']:
                verbose = True

        if 'id' in data:
            docid = data['id']
        else:
            docid = parameters['id']

        # default classifier
        classifier = 'en_core_web_sm'

        if 'spacy_ner_classifier_default' in parameters:
            classifier = parameters['spacy_ner_classifier_default']

        # set language specific classifier, if configured and document language detected
        if 'spacy_ner_classifiers' in parameters and 'language_s' in data:
            # is a language specific classifier there for the detected language?
            if data['language_s'] in parameters['spacy_ner_classifiers']:
                classifier = parameters['spacy_ner_classifiers'][data['language_s']]

        # fields whose text is analysed
        analyse_fields = ['content_txt', 'ocr_t', 'ocr_descew_t']

        text = ''
        for field in analyse_fields:
            if field in data:
                text = "{}{}\n".format(text, data[field])

        # extract sentences from text
        url = "http://localhost:8080/sents"
        if os.getenv('OPEN_SEMANTIC_ETL_SPACY_SERVER'):
            url = os.getenv('OPEN_SEMANTIC_ETL_SPACY_SERVER') + '/sents'

        headers = {'content-type': 'application/json'}
        d = {'text': text, 'model': classifier}

        retries = 0
        retrytime = 1
        # wait time until next retry will be doubled until reaching maximum of 120 seconds (2 minutes) until next retry
        retrytime_max = 120
        no_connection = True
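        # retry the spaCy request with exponential backoff (capped at retrytime_max) until the connection succeeds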

        while no_connection:
            try:
                if retries > 0:
                    print(
                        'Retrying to connect to Spacy services in {} second(s).'.format(retrytime))
                    time.sleep(retrytime)
                    retrytime = retrytime * 2
                    if retrytime > retrytime_max:
                        retrytime = retrytime_max

                response = requests.post(url, data=json.dumps(d), headers=headers)

                # if bad status code, raise exception
                response.raise_for_status()

                no_connection = False

            except requests.exceptions.ConnectionError as e:
                retries += 1
                sys.stderr.write(
                    "Connection to Spacy services (will retry in {} seconds) failed. Exception: {}\n".format(retrytime, e))

        sentences = response.json()

        etl = ETL()

        sentencenumber = 0

        for sentence in sentences:

            sentencenumber += 1

            partdocid = docid + '#sentence' + str(sentencenumber)

            partparameters = parameters.copy()
            partparameters['plugins'] = ['enhance_path', 'enhance_detect_language_tika_server',
                                         'enhance_entity_linking', 'enhance_multilingual']

            if 'enhance_ner_spacy' in parameters['plugins']:
                partparameters['plugins'].append('enhance_ner_spacy')
            if 'enhance_ner_stanford' in parameters['plugins']:
                partparameters['plugins'].append('enhance_ner_stanford')

            sentencedata = {}
            sentencedata['id'] = partdocid

            sentencedata['container_s'] = docid

            if 'author_ss' in data:
                sentencedata['author_ss'] = data['author_ss']

            sentencedata['content_type_group_ss'] = "Sentence"
            sentencedata['content_type_ss'] = "Sentence"
            sentencedata['content_txt'] = sentence

            # index sentence
            try:
                partparameters, sentencedata = etl.process(
                    partparameters, sentencedata)

            except BaseException as e:
                sys.stderr.write(
                    "Exception adding sentence {} : {}".format(sentencenumber, e))

        data['sentences_i'] = sentencenumber

        return parameters, data
Example #27
parser.add_argument("--crawl",
                    nargs="?",
                    const=True,
                    help="Crawling recent crash dumps.")
parser.add_argument("--train",
                    nargs="?",
                    const=True,
                    help="Training for parameter tuning.")
parser.add_argument("--stop",
                    nargs="?",
                    const=True,
                    help="Count file names that can be filtered.")
parser.add_argument("--detect", nargs=2, help="Detect crash dump similarity.")
args = parser.parse_args()
# suppress warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

if __name__ == "__main__":
    # crawling recent crash dumps
    if args.crawl:
        ETL().load()
    # training for parameter tuning
    if args.train:
        Train().training()
    # count file names that can be filtered
    if args.stop:
        StopWord().count_word()
    # detect crash dump similarity
    if args.detect:
        Detect(args.detect).detect_sim()
Example #28
                      weight_decay=args.rmsprop_decay)
g_losses = np.empty(0)

print("Initializing discriminator model and optimizer.")
d_net = Discriminator().cuda()
d_opt = optim.RMSprop(d_net.parameters(),
                      args.learning_rate_d,
                      weight_decay=args.rmsprop_decay)
d_losses = np.empty(0)

if args.retrain:
    g_net.load_state_dict(torch.load('../data/generator_state'))
    d_net.load_state_dict(torch.load('../data/discriminator_state'))

print("Beginning training..")
loader = ETL(args.batch_size, args.image_size, args.path)
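# ETL acts as the training data loader here, yielding (example, target) image batches via next_batch()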

for iteration in range(args.iterations):

    # Train discriminator
    for _ in range(args.k_discriminator):
        d_opt.zero_grad()

        d_examples, d_targets = loader.next_batch()
        d_noise = torch.Tensor(args.batch_size, 1, args.image_size,
                               args.image_size).uniform_(-1., 1.)
        d_noise = Variable(d_noise).cuda()
        d_samples = g_net(d_noise, d_examples).detach()

        d_real_pred = d_net(d_targets)
        d_fake_pred = d_net(d_samples)
Example #29
class Connector_Hypothesis(ETL):

    verbose = False

    documents = True

    token = None

    api = 'https://hypothes.is/api/'

    # how many annotations to download at once / per page
    limit = 10

    # initialize Open Semantic ETL
    etl = ETL()
    etl.read_configfile('/etc/etl/config')
    etl.read_configfile('/etc/opensemanticsearch/etl')
    etl.read_configfile('/etc/opensemanticsearch/hypothesis')
    etl.verbose = verbose

    exporter = export_solr.export_solr()

    #
    # index the annotated document, if not yet in index
    #

    def etl_document(self, uri):

        result = True
        doc_mtime = self.exporter.get_lastmodified(docid=uri)

        if doc_mtime:

            if self.verbose:
                print(
                    "Annotated document in search index. No new indexing of {}"
                    .format(uri))

        else:
            # Download and Index the new or updated uri

            if self.verbose:
                print(
                    "Annotated document not in search index. Start indexing of {}"
                    .format(uri))

            try:
                etl = Connector_Web()
                etl.index(uri=uri)
            except KeyboardInterrupt:
                raise KeyboardInterrupt
            except BaseException as e:
                sys.stderr.write("Exception while getting {} : {}".format(
                    uri, e))
                result = False
        return result

    #
    # import an annotation
    #

    def etl_annotation(self, annotation):

        parameters = {}
        parameters['plugins'] = ['enhance_multilingual']

        # since there can be multiple annotations for same URI,
        # do not overwrite but add value to existent values of the facet/field/property
        parameters['add'] = True
        data = {}

        # id/uri of the annotated document, not the annotation id
        parameters['id'] = annotation['uri']

        # first index / etl the webpage / document that has been annotated if not yet in index
        if self.documents:
            result = self.etl_document(uri=annotation['uri'])
            if not result:
                data['etl_error_hypothesis_ss'] = "Error while indexing the document that has been annotated"

        # annotation id
        data['annotation_id_ss'] = annotation['id']

        data['annotation_text_txt'] = annotation['text']

        tags = []
        if 'tags' in annotation:

            if self.verbose:
                print("Tags: {}".format(annotation['tags']))

            for tag in annotation['tags']:
                tags.append(tag)
        data['annotation_tag_ss'] = tags

        # write annotation to database or index
        self.etl.process(parameters=parameters, data=data)

    #
    # import all annotations since last imported annotation
    #

    def etl_annotations(self,
                        last_update="",
                        user=None,
                        group=None,
                        tag=None,
                        uri=None):

        newest_update = last_update

        if not self.api.endswith('/'):
            self.api = self.api + '/'

        searchurl = '{}search?limit={}&sort=updated&order=desc'.format(
            self.api, self.limit)

        if user:
            searchurl += "&user={}".format(user)

        if group:
            searchurl += "&group={}".format(group)

        if tag:
            searchurl += "&tag={}".format(tag)

        if uri:
            searchurl += "&uri={}".format(uri)

        # Authorization
        headers = {'user-agent': 'Open Semantic Search'}

        if self.token:
            headers['Authorization'] = 'Bearer ' + self.token

        # stats
        stat_downloaded_annotations = 0
        stat_imported_annotations = 0
        stat_pages = 0

        offset = 0
        last_page = False
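        # page through the annotations, newest first, stopping at the final page or at the first annotation not newer than last_update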

        while not last_page:

            searchurl_paged = searchurl + "&offset={}".format(offset)

            # Call API / download annotations
            if self.verbose:
                print("Calling hypothesis API {}".format(searchurl_paged))

            request = requests.get(searchurl_paged, headers=headers)

            result = json.loads(request.content.decode('utf-8'))

            stat_pages += 1

            if len(result['rows']) < self.limit:
                last_page = True

            # import annotations
            for annotation in result['rows']:

                stat_downloaded_annotations += 1

                if annotation['updated'] > last_update:

                    if self.verbose:
                        print(
                            "Importing new annotation {}annotations/{}".format(
                                self.api, annotation['id']))
                        print(annotation['text'])

                    stat_imported_annotations += 1

                    # save update time from newest annotation/edit
                    if annotation['updated'] > newest_update:
                        newest_update = annotation['updated']

                    self.etl_annotation(annotation)

                else:

                    last_page = True

            offset += self.limit

        # commit to index, if yet buffered
        self.etl.commit()

        if self.verbose:
            print("Downloaded annotations: {}".format(
                stat_downloaded_annotations))
            print("Imported new annotations: {}".format(
                stat_imported_annotations))

        return newest_update
Example #30
from etl import ETL


if __name__ == '__main__':
    etl = ETL(url='mongodb://localhost:27017/', db_name='sbp')
    etl.run()