Example #1
File: deepl.py Project: u-jung/atom-dm
    def number_to_translate(self,
                            target_lang,
                            from_lang="de",
                            data_type="archival description"):
        """
		returns the number of records still to translate
		"""
        dm = data_manager.DataManager()
        sql = ""
        if data_type == "archival description":
            sql = 'select count(id),culture from information_object_i18n where culture in ("' + from_lang.lower(
            ) + '","' + target_lang.lower() + '") group by culture;'

        if sql != "":
            r = dm.get_mysql(sql, False, False)
            if r:
                if len(r) == 2:
                    return abs(r[0][0] - r[1][0])
                elif len(r) == 1:
                    return r[0][0]
                else:
                    return 0
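
A side note on the string-built SQL above: it breaks on quotes in the inputs and is open to injection. A minimal sketch of the same per-culture count using DB-API placeholders (sqlite3 stands in for the project's MySQL layer so the demo is self-contained; whether dm.get_mysql accepts parameters is unknown):

import sqlite3

# Toy stand-in for information_object_i18n.
conn = sqlite3.connect(":memory:")
conn.execute("create table information_object_i18n (id integer, culture text)")
conn.executemany("insert into information_object_i18n values (?, ?)",
                 [(1, "de"), (2, "de"), (3, "en")])

# Placeholders keep the culture codes out of the SQL string itself.
rows = conn.execute(
    "select count(id), culture from information_object_i18n "
    "where culture in (?, ?) group by culture", ("de", "en")).fetchall()

counts = [r[0] for r in rows]
# Mirrors number_to_translate: difference of both counts, single count, or 0.
print(abs(counts[0] - counts[1]) if len(counts) == 2 else
      (counts[0] if counts else 0))  # -> 1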
Example #2
class Kalliope(object):

    base_url = "http://kalliope-verbund.info/sru"
    dm = data_manager.DataManager()
    BLOCKED_FILE = "../data/blocked.txt"  # list of items which should not be retrieved
    BLOCKED = []
    AUTHORITIES = []
    AUTHORITIES_FILE = "../data/authorities.json"

    IGNORE_INSTITUTIONS = []

    def __init__(self):
        pass

    def __del__(self):
        pass

    def _open_blocked(self):
        if os.path.isfile(self.BLOCKED_FILE):
            self.BLOCKED = []
            with open(self.BLOCKED_FILE, 'r') as file:
                self.BLOCKED = file.read().split('\n')

    def _store_blocked(self):
        with open(self.BLOCKED_FILE, 'w') as file:
            file.write("\n".join(self.BLOCKED))

    def export(self, **kwargs):
        """
			retrieves data from the German www.deutsche-digitale-bibliothek.de.
			Sends them back to DataManager in batches of {counter} size
			"""
        counter = kwargs['counter']
        from_term = kwargs['from_term']
        export_item_family = []
        export_list = []
        self.open_authorities()
        self._open_blocked()
        print(kwargs)
        for search_term in self.dm.search_term_generator(**kwargs):
            print(search_term)
            #self._open_blocked()
            for match in self._search_generator(search_term[0]):
                print(len(export_item_family), len(export_list),
                      "<<<<<<<<<<<<<<<< LEN\n")
                print(len(self.dm.LEGACY_IDS), "<<<<<<<<<<<<<<<< LEGACY_IDS\n")
                ##pprint.pprint(match)
                d = self._get_content(match)

                if d:
                    #print(self.dm.LEGACY_IDS)
                    if self.dm.predict_item(d['title'] + " " +
                                            d['scopeAndContent'] + " " +
                                            d['nameAccessPoints'] + " " +
                                            d['arrangement']):
                        if d['legacyId'] not in self.dm.LEGACY_IDS and d[
                                'legacyId'] not in self.BLOCKED and not self.dm.is_in_atom(
                                    d['legacyId']):
                            self.dm._add_to_legacy_ids(d['legacyId'])
                            export_item_family.append(d)
                            print(len(export_item_family), "länge")
                            if d['parentId'] != "":
                                for parent in self._parent_generator(
                                        d['parentId']):
                                    #print("we have a parent")
                                    export_item_family.append(parent)
                        else:
                            print(d['legacyId'], " skipped")
                    else:
                        print("prediction failed")

                if len(export_item_family) > 0:
                    export_item_family = [
                        x for x in export_item_family if x is not None
                    ]
                    #pprint.pprint(export_item_family)
                    #export_item_family.sort(key=operator.itemgetter('apd_level_of_description'))
                    export_list.extend(export_item_family)
                    export_item_family = []

            if len(export_list) > counter:
                self.dm.store_out_files()
                self.store_authorities()
                self._store_blocked()
                print("Len of export list: ", len(export_list))
                yield export_list.copy()
                export_list = []
        yield export_list.copy()
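
A usage sketch for the generator above; the keyword values are hypothetical, and any further kwargs that DataManager.search_term_generator expects are omitted:

kal = Kalliope()
# export() yields a batch once more than `counter` records have accumulated,
# plus one final, possibly smaller, batch.
for batch in kal.export(counter=100, from_term="Humboldt"):
    print(len(batch), "records ready for import")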

    def _parent_generator(self, parentId):
        r = self.get(parentId, "id")
        #print(r)
        if r[1] not in ["0", ""]:
            #pprint.pprint(r)
            if isinstance(r[0],
                          list):  #some records of Kalliope are duplicates
                match = r[0][0]['srw:recordData']['mods']
            else:
                match = r[0]['srw:recordData']['mods']
            print("PARENT")
            #pprint.pprint(match)
            #print('-----')
            d = self._get_content(match)
            if d:
                if d['legacyId'] not in self.dm.LEGACY_IDS and d[
                        'legacyId'] not in self.BLOCKED and not self.dm.is_in_atom(
                            d['legacyId']):
                    self.dm._add_to_legacy_ids(d['legacyId'])
                    yield (d)
                    if d['parentId'] != "":
                        yield from self._parent_generator(d['parentId'])
                else:
                    print("Parent already registred")

    def _get_content(self, match):
        d = {}
        pprint.pprint(match)
        #match=match['srw:recordData']['mods']
        for fieldname in g.ARCHIVAL_DESCRIPTIONS_FIELDS:
            d[fieldname] = ""
        d['culture'] = g.CULTURE
        d['levelOfDescription'] = "Gliederung"

        d['legacyId'] = match['identifier']['#text'][
            match['identifier']['#text'].rfind("/") + 1:]
        d['descriptionIdentifier'] = d['legacyId']
        if 'relatedItem' in match:
            d['parentId'] = match['relatedItem']['identifier'][0]['#text']
            d['parentId'] = d['parentId'][d['parentId'].rfind("/") + 1:]
            d['arrangement'] = match['relatedItem']['titleInfo']['title']
        d['title'] = match['titleInfo']['title']
        if isinstance(match['typeOfResource'], dict):
            d['physicalCharacteristics'] = match['typeOfResource']['#text']
        else:
            d['physicalCharacteristics'] = match['typeOfResource']
        if '@manuscript' in match['typeOfResource']:
            if match['typeOfResource']['@manuscript'] == "yes":
                d['physicalCharacteristics'] += "|Manuscript"
                d['levelOfDescription'] = "Objekt"
        if '@collection' in match['typeOfResource']:
            if match['typeOfResource']['@collection'] == "yes":
                d['physicalCharacteristics'] += "|Collection"
                d['levelOfDescription'] = "Sammlung"
        if 'languageOfCataloging' in match['recordInfo']:
            if isinstance(match['recordInfo']['languageOfCataloging'], dict):
                language = match['recordInfo']['languageOfCataloging'][
                    'languageTerm']
            else:
                language = match['recordInfo']['languageOfCataloging']
            for lang_term in language:
                if '@type' in lang_term:
                    d['languageNote'] = lang_term['#text']
                if '@authority' in lang_term:
                    if lang_term['#text'] in g.LANGUAGES:
                        d['language'] = g.LANGUAGES[lang_term['#text']]
                        d['culture'] = d['language']
        if 'abstract' in match:
            if isinstance(match['abstract'], str):
                d['scopeAndContent'] = match['abstract']
            else:
                d['scopeAndContent'] = match['abstract']['#text']
        if 'physicalDescription' in match:
            d['extentAndMedium'] = match['physicalDescription']['extent']
        if 'originInfo' in match:
            if 'place' in match['originInfo']:
                d['placeAccessPoints'] = match['originInfo']['place'][
                    'placeTerm']['#text']
            if 'dateCreated' in match['originInfo']:
                if match['originInfo']['dateCreated']:
                    if isinstance(match['originInfo']['dateCreated'], dict):
                        d['eventDates'] = match['originInfo']['dateCreated'][
                            '#text']
                    else:
                        d['eventDates'] = match['originInfo']['dateCreated']
                    (d['eventStartDates'],
                     d['eventEndDates']) = self.dm.build_eventDates(
                         d['eventDates'])
                    if not self.dm._is_in_time(d['eventStartDates'],
                                               d['eventEndDates']):
                        self._add_blocked(d['legacyId'])
                        print("added to blocked")
                        return False

        if 'recordContentSource' in match['recordInfo']:
            if isinstance(match['recordInfo']['recordContentSource'], dict):
                d['repository'] = match['recordInfo']['recordContentSource'][
                    '#text']

            else:
                d['repository'] = match['recordInfo']['recordContentSource']
            d['repository'] = re.sub(r'<|>', '', d['repository'])
        else:
            print("record don't have a recordContentSource")
        if 'location' in match:
            if isinstance(match['location'], list):
                loc_list = match['location']
            else:
                loc_list = [match['location']]
            #print(loc_list)
            for loc in loc_list:
                #print(loc)
                if 'shelfLocator' in loc:
                    #print(loc)
                    d['physicalObjectLocation'] = loc['shelfLocator']
                if 'url' in loc:
                    if '#text' in loc['url']:
                        d['findingAids'] = loc['url']['#text']
                    else:
                        d['findingAids'] = loc['url']
        else:
            print("record don't have a location tag?")

        if 'name' in match:
            if isinstance(match['name'], list):
                name_list = match['name']
            else:
                name_list = [match['name']]
            #print("we have names")
            name_arr = []
            for n in name_list:
                name_arr.append(self.name_clean(n['namePart']))
                self._add_authority(n)
            print(name_arr)
            if name_arr:
                d['nameAccessPoints'] = "|".join(name_arr)

        #print(d,"here is d")
        return d
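
_get_content relies on xmltodict's conventions: attribute-less elements parse to plain strings, while elements with attributes become dicts holding '@attr' keys and the text under '#text'. A minimal demonstration:

import xmltodict

mods = xmltodict.parse(
    "<mods>"
    "<titleInfo><title>Nachlass Beispiel</title></titleInfo>"
    '<typeOfResource manuscript="yes">text</typeOfResource>'
    "</mods>")["mods"]

print(mods["titleInfo"]["title"])             # Nachlass Beispiel
print(mods["typeOfResource"]["#text"])        # text
print(mods["typeOfResource"]["@manuscript"])  # yes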

    def name_clean(self, name_str):
        if isinstance(name_str, str):
            name_str = re.sub(r'<|>', '', name_str)
            name_str = re.sub(r'\([^\)]*\)', '', name_str).strip(" ")
            arr = name_str.split(",")
            if len(arr) == 2:
                name_str = arr[1] + " " + arr[0]
            return name_str
        else:
            return ""

    def _add_blocked(self, legacyId):
        if legacyId not in self.BLOCKED:
            self.BLOCKED.append(legacyId)

    def _add_authority(self, d):
        #print("adding to autorithy",d)
        if '@valueURI' in d:
            d['value'] = d['@valueURI']
        elif 'id' in d:
            d['value'] = d['id']
        else:
            d['value'] = ""
        #print(d)
        if not next(
            (x for x in self.AUTHORITIES if x['value'] == d['value']), False):
            self.AUTHORITIES.append(d)

        #print(self.AUTHORITIES,"???")
        return

    def store_authorities(self):
        with open(self.AUTHORITIES_FILE, 'w') as file:
            file.write(
                json.dumps(self.AUTHORITIES,
                           sort_keys=True,
                           indent=4,
                           ensure_ascii=False))

    def open_authorities(self):
        if len(self.AUTHORITIES) == 0:
            if os.path.isfile(self.AUTHORITIES_FILE):
                with open(self.AUTHORITIES_FILE, 'r') as file:
                    self.AUTHORITIES = json.load(file)

    def _search_generator(self, search_term):
        """
		lookup for search results in DDB using the search terms from DataManager

		Parameter:
		search_term : a tupel ({search_term String},{search_term WD_Instance})
		"""
        search_log = ""
        print("entrering search_generator with ", search_term)
        try:
            #

            (r, number_of_results) = self.get(search_term)
            log = str(number_of_results
                      ) + "\t" + search_term + "\tKAL\t" + time.strftime(
                          "%Y-%m-%d %H:%M:%S") + "\n" + search_log
            print(log)  #
            self.dm.SEARCH_LOG = log + self.dm.SEARCH_LOG
            #print(r)
            for e in r:
                yield e['srw:recordData']['mods']

        except Exception as e:

            #self.store_id_list()
            print("Error:", sys.exc_info()[0])
            print(e)

    def get(self, search_term, method="fulltext"):
        try:
            #print(search_term)
            data = {}
            data['query'] = '"' + search_term + '"'
            data['version'] = "1.2"
            data['recordSchema'] = 'mods'
            if method == "fulltext":
                data['operation'] = "searchRetrieve"
            if method == "id":
                data['operation'] = "searchRetrieve"
                data['query'] = "ead.id=" + search_term

            url_values = urllib.parse.urlencode(data)
            full_url = self.base_url + "?" + url_values

            print(full_url)

            r = urllib.request.urlopen(full_url).read().decode("utf-8")
            if r:
                d = json.loads(json.dumps(xmltodict.parse(r)))

                return (d['srw:searchRetrieveResponse']['srw:records']
                        ['srw:record'],
                        d['srw:searchRetrieveResponse']['srw:numberOfRecords'])
            else:
                return ([], "0")

        except Exception as e:
            self.dm.store_out_files()
            print('Error! Code: {c}, Message: {m}'.format(c=type(e).__name__,
                                                          m=str(e)))
            return ("", "")
Example #3
File: deepl.py Project: u-jung/atom-dm
    def translate_information_objects(self, target_lang, from_lang="DE"):
        dm = data_manager.DataManager()
        done = []

        sql = 'select i.id,i.repository_id,io.title,io.scope_and_content,io.archival_history,io.culture from information_object i join information_object_i18n io on i.id=io.id ;'
        information_objects = dm.get_mysql(sql, False, False)

        # Assumption: a note query was intended here; the original reused the
        # information_object SQL. The columns used below are: id, content,
        # culture, type_id, object_id.
        sql = 'select n.id, ni.content, ni.culture, n.type_id, n.object_id from note n join note_i18n ni on n.id=ni.id;'
        notes = dm.get_mysql(sql, False, False)
        # create a corpus of existing translations
        print(
            "Creating a corpus of sentences which have already been translated")
        for e in information_objects:
            #print(len(done))
            if e[5].upper() == target_lang:
                de_item = next((x for x in information_objects
                                if x[0] == e[0] and x[5] == "de"), False)
                if de_item:
                    for i in range(2, 5):
                        if not (de_item[i] is None):
                            exist = next(
                                (x
                                 for x in done if x[1] and x[0] == de_item[i]),
                                False)
                            if not exist:
                                done.append((de_item[i], e[i]))
        for e in notes:
            if e[3] != g.A_LANGUAGE_NOTE_ID:

                if not (e[2] is None) and e[2].upper() == target_lang:
                    de_item = next(
                        (x for x in notes if x[0] == e[0] and x[2] == "de"),
                        False)
                    if de_item:
                        if not (de_item[1] is None):
                            exist = next(
                                (x
                                 for x in done if x[1] and x[0] == de_item[1]),
                                False)
                            if not exist:
                                done.append((de_item[1], e[1]))

        #print(len(done), done)
        ##a=input("STOP")
        # start iterating
        k = 0
        to_translate = self.number_to_translate(target_lang, from_lang)
        for e in information_objects:

            if e[1] == 124547:  # hard-coded repository id to skip
                continue
            title = ""
            scope_and_content = ""
            archival_history = ""
            add_lang_note = False
            new_content = False
            if e[5].lower() == "de":
                if not (e[2] is None):
                    r = next((x for x in done if x[0] == e[2]), False)
                    if r:
                        title = so.escapeQuotes(r[1])
                        #print("known ", title[0:40])
                    else:
                        print("----> ", k, "/", to_translate)
                        print("unknown ", e[2][0:40])
                        title = so.escapeQuotes(
                            self.translate_once(e[2], target_lang))
                        if title is None:
                            title = ""
                        else:
                            new_content = True
                    if title != "":
                        done.append((e[2], title))
                if not (e[3] is None):
                    r = next((x for x in done if x[0] == e[3]), False)
                    if r:
                        scope_and_content = so.escapeQuotes(r[1])
                        print("known ", scope_and_content[0:40])
                    else:
                        print("unknown ", e[3][0:40])
                        scope_and_content = so.escapeQuotes(
                            self.translate_once(e[3], target_lang))
                        if scope_and_content is None:
                            scope_and_content = ""
                        else:
                            new_content = True
                    if scope_and_content != "":
                        done.append((e[3], scope_and_content))
                if not (e[4] is None):
                    r = next((x for x in done if x[0] == e[4]), False)
                    if r:
                        archival_history = so.escapeQuotes(r[1])
                        print("known ", archival_history[0:40])
                    else:
                        print("unknown ", e[4][0:40])
                        archival_history = so.escapeQuotes(
                            self.translate_once(e[4], target_lang))
                        if archival_history is None:
                            archival_history = ""
                        else:
                            new_content = True
                    if archival_history != "":
                        done.append((e[4], archival_history))

                if title + scope_and_content + archival_history == "":
                    continue

                if new_content:

                    r = next((x for x in information_objects
                              if x[0] == e[0] and x[5] == target_lang.lower()),
                             False)
                    if r:
                        sql = 'update information_object_i18n set title="' + title + '", scope_and_content="' + scope_and_content + '", archival_history="' + archival_history + '" where id=' + str(
                            e[0]) + ' and culture ="' + target_lang.lower(
                            ) + '";'
                    else:
                        sql = 'insert into information_object_i18n (id,title,scope_and_content,archival_history, culture) values (' + str(
                            e[0]
                        ) + ',"' + title + '","' + scope_and_content + '","' + archival_history + '","' + target_lang.lower(
                        ) + '");'
                    print(sql)
                    r = dm.get_mysql(sql, True, True)
                    k += 1
                    #if k>4000:
                    #break
                else:
                    continue
                add_lang_note = True
                individual_notes = [
                    x for x in notes if x[4] == e[0] and x[2] == "de"
                ]
                language_note_id = 0
                if individual_notes:

                    for note in individual_notes:
                        if note[3] != g.A_LANGUAGE_NOTE_ID:
                            r = next((x for x in done if x[0] == note[1]),
                                     False)
                            if r:
                                content_translated = r[1]
                            else:
                                content_translated = self.translate_once(
                                    note[1], target_lang)

                            r = next((x for x in notes if x[0] == note[0]
                                      and x[2] == target_lang.lower()), False)
                            if r:
                                if r[1] != content_translated:
                                    sql = 'update note_i18n set content="' + content_translated + '" where id=' + str(
                                        note[0]) + ';'
                                    r = dm.get_mysql(sql, True, True)
                            else:
                                sql = 'insert into note_i18n (content,id,culture) values ("' + content_translated + '",' + str(
                                    note[0]) + ',"' + target_lang.lower(
                                    ) + '");'
                                r = dm.get_mysql(sql, True, True)
                            if archival_history != "":
                                done.append((note[1], content_translated))
                        else:
                            language_note_id = note[0]

                # add language note
                if add_lang_note:
                    if target_lang == "EN":
                        message = "This description was automatically translated with the help of www.DeepL.com. Translation errors are possible. Please note that the document itself has not been translated."
                    else:
                        message = "Cette description a été automatiquement traduite à l'aide de www.DeepL.com. Des erreurs de traduction sont possibles. Veuillez noter que le document lui-même n'a pas été traduit."

                    if language_note_id == 0:
                        r = next(
                            (x for x in notes
                             if x[4] == e[0] and x[3] == g.A_LANGUAGE_NOTE_ID),
                            False)
                        if r:
                            language_note_id = r[0]
                        else:
                            sql = 'insert into note (object_id,type_id, source_culture) values (' + str(
                                e[0]) + ',' + str(
                                    g.A_LANGUAGE_NOTE_ID
                                ) + ',"' + target_lang.lower() + '");'
                            language_note_id = dm.get_mysql(sql, True, True)
                            sql = 'insert into note_i18n (content,id,culture) values ("deutsch",' + str(
                                language_note_id) + ',"de");'
                            r = dm.get_mysql(sql, True, True)

                    if language_note_id > 0:
                        r = next(
                            (x for x in notes
                             if x[4] == e[0] and x[2] == target_lang.lower()),
                            False)
                        if r:
                            sql = 'update note_i18n set content="' + message + '" where id=' + str(
                                language_note_id
                            ) + ' and culture="' + target_lang.lower() + '";'
                        else:
                            sql = 'insert into note_i18n (content,id,culture) values ("' + message + '",' + str(
                                language_note_id) + ',"' + target_lang.lower(
                                ) + '");'
                        #print(sql)
                        r = dm.get_mysql(sql, True, True)
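
All of the escapeQuotes calls above exist only because the statements are built by concatenation. A minimal parameterized alternative for the note_i18n upsert (sqlite3 keeps the sketch self-contained; the real schema is only assumed from the statements above):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("create table note_i18n (content text, id integer, culture text)")

def upsert_note(conn, note_id, culture, content):
    # Placeholders make manual quote escaping unnecessary.
    cur = conn.execute(
        "update note_i18n set content=? where id=? and culture=?",
        (content, note_id, culture))
    if cur.rowcount == 0:
        conn.execute(
            "insert into note_i18n (content, id, culture) values (?, ?, ?)",
            (content, note_id, culture))

upsert_note(conn, 1, "en", 'He said "hello"')
print(conn.execute("select content from note_i18n").fetchone()[0])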
Example #4
File: scope.py Project: u-jung/atom-dm
import urllib
import json
import sys
from lxml import html
import re

from atom.main import data_manager, g
#from http.cookiejar import CookieJar
from atom.helpers.helper import fileOps, listOps, stringOps, osOps
fo = fileOps()
lo = listOps()
so = stringOps()
oo = osOps()

dm = data_manager.DataManager()


class ScopeArchiv():
    """
	Lookup for data in a single record inside Archive Portal Europe
	"""
    RESULTS_DICT = {}
    MAPPINGS = (
        ('Scope and content', "scopeAndContent"),
        ("Records creator's history", "archivalHistory"),
        ("Source of acquisition", "acquisition"),
        ("Conditions governing access", "accessConditions"),
        ("Conditions governing reproduction", "reproductionConditions"),
        ("Other finding aids", "findingAids"),
        ("Existence and location of copies", "locationOfCopies"),
Example #5
File: atom.py Project: u-jung/atom-dm
def main(args):
	

	d_args={"ops":[]}
	#print (args)
	last_key="ops"
	for arg in args:
		kv=arg.split("=")
		if len(kv)==2:
			a=kv[1].split(",")
			d_args[kv[0].replace("-","_")]=kv[1] if len(a)==1 else a
			if kv[1].lower()=="false":
				d_args[kv[0].replace("-","_")]=False
			if kv[1].lower()=="true":
				d_args[kv[0].replace("-","_")]=True			
			
			last_key=kv[0].replace("-","_")
		else:
			if kv[0][0:1]=="-":
				d_args["ops"].append(kv[0])
			else:
				a=kv[0].split(",")
				if isinstance(d_args[last_key],list):
					if len(a)>1:
						d_args[last_key].extend(a)
					else:
						d_args[last_key].append(a[0])
				else:
					d_args[last_key]+=" " + a[0]
					
	
	default_args={"timespan":True, 'predict':True,'predefined':[]}
	d_args=fuo.get_args(d_args,default_args)
		
	print (d_args)		
			
	
	err=0
	if len(args)>1:
		
		if  "-i" in d_args["ops"]:
			if 'source' in d_args:
				dm=data_manager.DataManager()				
				if d_args['source'].lower() in ("ddb","fbn","kal","eadxml","ddbhtml","ape","nad","sca"):
					dm.imports(**d_args)
				else: 
					print ("unknown import source")

			else:
				err=1
		elif "-h" in d_args['ops']:
			f=open("README.md","r")
			helptext=f.read()
			f.close()
			print(helptext)
			
		elif   "-m" in d_args["ops"]:
			if 'action' in d_args:
				if d_args['action'] =="create-sitemap":
					dm=data_manager.DataManager()
					dm.create_sitemaps()
				if d_args['action'] == "publish":
					dm=data_manager.DataManager()
					dm.publish()
				if d_args['action'] =="save-database":
					os=helper.stringOps()
					pass
				if d_args['action'] =="create-location-index":
					print("creating location index")
					loc=location.Location()
					loc.create_location_index()
				if d_args['action']=="make-arrangements":
					dm=data_manager.DataManager()
					repository=""
					if 'repository' in d_args:
						repository=int(d_args['repository'])
					dm.fill_repository_id()
					dm.create_arrangements(repository)
					
				if d_args['action'] =="join-tmp-csv":
					dm=data_manager.DataManager()
					dm.join_csv()
				if d_args['action']=="fill-repository-id":
					dm=data_manager.DataManager()
					dm.fill_repository_id()						
				if d_args['action'] =="reduce-csv":
					dm=data_manager.DataManager()
					dm.reduce_csv(True)	
				if d_args['action'] =="sort-import":
					dm=data_manager.DataManager()
					dm.sort_import()
				if d_args['action'] =="merge-csv":
					dm=data_manager.DataManager()
					dm.merge_csv(args[3],args[4])					
				if d_args['action'] =="update-access-points-list":
					ap=data_manager.access_points()
					ap.update_access_points_list()
				if d_args['action'] =="normalize-name-access-points":
					ap=data_manager.access_points()
					ap.normalize_name_access_points()	
				if d_args['action'] =="add-wd-identifier2actor":
					ap=data_manager.access_points()
					ap.add_wd_identifier2actor()
				if d_args['action'] =="add-wd-identifier2term":
					ap=data_manager.access_points()
					ap.add_wd_identifier2term()
				if d_args['action'] =="normalize-other-access-points":
					ap=data_manager.access_points()
					ap.normalize_other_access_points()		
				if d_args['action'] =="find-name-access-points":
					ap=data_manager.access_points()
					ap.find_name_access_points(True)
				if d_args['action'] =="find-other-access-points":#old
					ap=data_manager.access_points()
					ap.find_other_access_points(True)	
				if d_args['action'] == "add-other-names":
					ap=data_manager.access_points()
					ap.add_other_names()
				if d_args['action'] =="translate-information-objects":
					tr=deepl.deepl()
					if d_args['lang'] in ("en","EN"):
						tr.translate_information_objects("EN")
					if d_args['lang'] in ("fr","FR"):
						tr.translate_information_objects("FR")	
				if d_args['action'] == 'change-taxonomy':
					taxonomy_from=int(d_args['from'])
					taxonomy_to=int(d_args['to'])
					ap=data_manager.access_points()
					ap.change_taxonomy(taxonomy_from, taxonomy_to)				
				
				if d_args['action'] =="add-creator":
					object_slug=d_args['oslug']
					actor_slug=d_args['aslug']
					dm=data_manager.DataManager()
					dm.add_creator(object_slug,actor_slug)
				if 	d_args['action'] =="replace":			
					dm=data_manager.DataManager()	
					search_term=input("search term ? ")
					replace_term=input ('replace_term ? ')	

					culture=input('culture ? ').lower()
					if culture=="":
						culture="de"
					fields=input('fields in information_object_i18n to search for (separated by comma) ?\n (Just Enter for "title,scope_and_content,archival_history"): ')
					if input("ignore case ? y/(n) ") in ( "yes","Yes","y","Y"):
						ignore_case=True
					else:
						ignore_case=False
					if input("words only ? y/(n) ")  in ( "yes","Yes","y","Y"):
						words_only=True
					else:
						words_only=False	
					q="So, you want to replace occurencies of '"+search_term+"' by '" + replace_term +"' in culture='"+culture+"' ?  y/(n)"
					if input(q) not in ("yYjJ"):
						sys.exit() 
					print ("\n\n\nPLEASE MAKE SURE YOU HAVE A BACKUP COPY OF YOUR DATABASE!\n\n\n")
					dm.replace(	search_term,replace_term,culture,fields, words_only,ignore_case)	
				if d_args['action'] =="find-access-points-in-atom":
					ap=data_manager.access_points()
					if 'last_revision' in d_args:
						last_revision=d_args['last_revision']
					else:
						last_revision=""		
					print("lr",last_revision)	
					if d_args['type'] in ("Wikidata", "wd","WD"):
						print("open Wikidata corpus")
						dm=data_manager.DataManager()

						ap.find_access_points_in_atom("wd",last_revision)	
					else:
						print ("open AtoM corpus")
						ap.find_access_points_in_atom("atom",last_revision)
					print ("normalize access points")
					ap.normalize_name_access_points()
					print ('normalize other access_points')
					ap.normalize_other_access_points()	
					#ap.find_other_access_points(True)
					print ('clean up lower relations')
					ap.clean_lower_relations()
					ap.rebuild_nested()
				if d_args['action'] =="index-wd-keywords":
					
					dm=data_manager.DataManager()
					dm._index_keywords()									
				if d_args['action'] =="create-keyword-list":
					dm=data_manager.DataManager()
					dm.create_keyword_list()									
					
		elif d_args['action']=="build-eventDates":
			dm=data_manager.DataManager()
			print("testing: build_EventDates")
			print(dm.build_eventDates(d_args['param']))
		elif args[1] in ('--create_access_points', "-a"):
			pass
		elif args[1] in ("--location", "-p"):
			pass
		elif args[1] in ("--linguistic", "-l"):
			pass
		elif args[1] in ("--keywords", "-k", "--index-keywords"):
			dm=data_manager.DataManager()	
			dm._index_keywords()
		elif args[1] in ("--build-upper-levels"):
			dm=data_manager.DataManager()	
			pa=dm.TMP_RESULTS_PATH+time.strftime("%Y%m%d_%H%M%S")
			pathlib.Path(pa).mkdir(parents=True, exist_ok=True) 
			l=dm.build_upper_levels(None,None,None)
			dm._post_import(l,pa)
			dm.write_csv(l,pa+"/import.csv","archival_description")
		elif args[1] in ("--clean", "-c"):
			dm=data_manager.DataManager()
			if len(args)>2:
				if args[2] in ("de","en","fr"):
					dm.strip_tags(args[2])
				else:
					dm.strip_tags("de")
			else:
				dm.strip_tags("de")
		elif args[1] in ("--ai", "-ai"):
			ki=ai.ai()
			if len(args)>2:
				if args[2] == "add_data":
					if len(args)>3:
						data=None
						index=None
						target=None
						for arg in args[3:]:
							if arg[0:2].lower()=="-d":
								data=arg[3:]
							if arg[0:2].lower()=="-t":
								target=arg[3:]
							if arg[0:2].lower()=="-i":
								index=arg[3:]	
						ki.add2data(data,target,index)	
				if args[2] == "add-data-from-file-list":
					
					if len(args)>3:
						ki.add2data_from_file_list(args[3])
					else:
						ki.add2data_from_file_list(None)	
				if args[2] =="add-data-from-atom":
					ki.add2data_from_atom()	
				if args[2] =="word2ix":
					ki.index_words()	
				if args[2] in ("create_sets","create-sets"):
					print("Creating files ", ki.TRAIN_DATA_TEXT_FILE , " and " , ki.TEST_DATA_TEXT_FILE , " with ratio " , ki.RATIO)
					ki.create_sets(ki.RATIO)
				if args[2] =="train":
					ki.training()
				if args[2] =="test":
					ki.testing()
				if args[2] =="predict":
					from atom.helpers import ai
					if len(args)>3:
						print(ki.predict(args[3]))
		elif args[1] in ("--help", "-h"):
			print(g.HELP_STRING)
		

	if err==0:
		return 0
	else:
		print ("Not enough arguments. See ./atom-dm -h or --help for more information")
		return 1
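
A reduced, standalone sketch of the key=value parsing main() performs (the bare-word continuation branch and the fuo.get_args defaults are left out):

def parse_args(args):
    d_args = {"ops": []}
    for arg in args:
        kv = arg.split("=")
        if len(kv) == 2:
            key = kv[0].replace("-", "_")
            values = kv[1].split(",")
            d_args[key] = kv[1] if len(values) == 1 else values
            if kv[1].lower() in ("true", "false"):
                d_args[key] = kv[1].lower() == "true"
        elif kv[0].startswith("-"):
            d_args["ops"].append(kv[0])
    return d_args

print(parse_args(["-m", "action=replace", "lang=en", "predict=false"]))
# {'ops': ['-m'], 'action': 'replace', 'lang': 'en', 'predict': False}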
Example #6
class eadxml(object):
    #data_directory='data/'
    ns = ""
    dm = data_manager.DataManager()
    ead_data = []
    sourcefile = ""
    ns = ""
    source_str = ""
    root = {}
    namespaces = {}
    tree = {}
    tree = object()
    meta = {
        "repository": "",
        "levelOfDescription": "Fonds",
        "legacyId": "",
        "title": "",
        "parentId": ""
    }

    def __init__(self, sourcefile, source_str):

        self.sourcefile = sourcefile
        self.source_str = source_str
        if os.path.isfile(self.sourcefile):
            try:
                #while True:
                self.tree = etree.parse(self.sourcefile)

                self.root = self.tree.getroot()
                print("File ", sourcefile, " found ")
                self.namespaces = dict([
                    node for _, node in ET.iterparse(sourcefile,
                                                     events=['start-ns'])
                ])
                if "" in self.namespaces:
                    self.ns = "{" + self.namespaces[''] + "}"
                else:
                    self.ns = ""

            #if True:
            except Exception:
                print("Couldn't open source file ", self.sourcefile)

    """			
	def __init__2(self, sourcefile, source_str):

		self.sourcefile=sourcefile
		self.source_str=source_str
		if os.path.isfile(self.sourcefile):
			try:
				self.tree = ET.iterparse(sourcefile, events=('end', ))

				self.root = etree.parse(sourcefile)
				print("File ", sourcefile, " found ")
			except:
				print("Couldn't open source File ", self.sourcefile)

			self.namespaces=dict([node for _,node in ET.iterparse(sourcefile, events=['start-ns'])])
			if "" in self.namespaces:
				self.ns="{"+self.namespaces['']+"}"
			else:
				self.ns=""
		else:
			print("Couldn't open source File ")
	
	"""

    #def export (self,counter, from_term):
    def export(self, **kwargs):
        yield self.transform()

    """
	def build_era_facet1(self,era):
		# splitting the era into separate years
		#obsolete ?
		era_list=era.split('/')

		if len(era_list)==1:
			return self.left(era_list[0],4)
			
		else:
			years=[]
			for x in range(int(self.left(era_list[0],4)),int(self.left(era_list[1],4))+1):
				years.append(str(x))
				#print years
			return '*['+ '##'.join(years) + ']*'
	
	"""

    def left(self, s, amount):
        return s[:amount]

    """	

	def export2(self,filepath,source_str):
		# exports a dict list
		l=self.transform(source_str)
		return l
	"""

    def transform(self):
        for child in self.root:
            #print(child.tag)
            if child.tag == self.ns + "eadheader":
                for head in child:
                    if head.tag == self.ns + "eadid":
                        self.meta["legacyId"] = head.text
                        self.meta['descriptionIdentifier'] = head.text
            if child.tag == self.ns + "archdesc":
                for sub_child in child:
                    if sub_child.tag == self.ns + "bioghist":
                        self.meta['archivalHistory'] = self.cleanHtml(
                            str(etree.tostring(sub_child, encoding='unicode')))
                    if sub_child.tag == self.ns + "scopecontent":
                        self.meta['scopeAndContent'] = self.cleanHtml(
                            str(etree.tostring(sub_child, encoding='unicode')))
                    if sub_child.tag == self.ns + "prefercite":
                        self.meta['prefercite'] = sub_child.text
                    if sub_child.tag == self.ns + "did":
                        for did in sub_child:
                            if did.tag == self.ns + 'unittitle':
                                self.meta['title'] = did.text
                            if did.tag == self.ns + "unitid":
                                if 'call number' in did.attrib:
                                    self.meta['identifier'] = did.text
                                elif 'type' in did.attrib:
                                    if did.attrib['type'] == 'call number':
                                        self.meta['identifier'] = did.text
                                else:
                                    self.meta['identifier'] = did.text
                            if did.tag == self.ns + "unitdate":
                                self.meta['eventDates'] = did.text
                                (self.meta['eventStartDates'],
                                 self.meta['eventEndDates']) = self.dm.build_eventDates(
                                     self.meta['eventDates'])
                            if did.tag == self.ns + "repository":

                                for rep in did:
                                    if rep.tag == self.ns + "corpname":

                                        if rep.tag == self.ns + "corpname":
                                            self.meta['repository'] = rep.text
                                        if rep.tag == self.ns + "extref":
                                            sel.meta['findingAids'] = rep.attrib[
                                                '{http://www.w3.org/1999/xlink}href']
                            if did.tag == self.ns + "abstract":
                                if 'scopeAndContent' in self.meta:
                                    self.meta['scopeAnContent'] += did.text
                                else:
                                    self.meta['scopeAnContent'] = did.text
                            if did.tag == self.ns + 'odd':
                                for p in did:
                                    if p.tag == self.ns + 'p':
                                        self.meta['title'] += " - " + p.text

                    if sub_child.tag == self.ns + "dsc":
                        self.meta['arrangement'] = self.meta['title']
                        self.get_c(sub_child, None, None,
                                   self.meta['legacyId'],
                                   self.meta['repository'], self.meta['title'])
                        #print(json.dumps(self.ead_data, indent=2))
        if not ('culture' in self.meta):
            self.meta['culture'] = g.CULTURE

        self.ead_data.append(self.meta.copy())
        #self.ead_data=self.ead_data[::-1]
        #print(json.dumps(self.ead_data, indent=2))
        #self.ead_data[len(self.ead_data)-1].update(self.ead_data[0])
        #self.ead_data.pop(0)x
        """
		if self.ead_data[1]['parentId']=="":
			self.ead_data[1]['parentId']=self.ead_data[0]['parentId']
			self.ead_data.pop(0)
		"""
        self.ead_data.insert(0, self.ead_data.pop(len(self.ead_data) - 1))
        i = 0
        empty = None
        for i, e in enumerate(self.ead_data):
            if e['legacyId'] == e['parentId']:
                e['parentId'] = ""
            test_str = ""
            for k, v in e.items():
                if k in ['legacyId', 'repository', 'levelOfDescription']:
                    continue
                test_str += str(v)
            if test_str.strip(" ") == "":
                empty = i
        if empty is not None:
            self.ead_data.pop(empty)

        #print("-----------------")
        #print(json.dumps(self.ead_data, indent=2))
        return self.ead_data
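
transform() matches tags by prepending the default namespace in Clark notation; a minimal demonstration of that convention with lxml (toy EAD fragment, not the project's data):

from lxml import etree

xml = b'<ead xmlns="urn:isbn:1-931666-22-9"><eadheader><eadid>BestandX</eadid></eadheader></ead>'
root = etree.fromstring(xml)
ns = "{urn:isbn:1-931666-22-9}"  # lxml expands every tag to {namespace}localname
for child in root:
    if child.tag == ns + "eadheader":
        print(child[0].text)  # BestandX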

    def get_c(self, elem, levelOfDescription, l_id, p_id, p_arrangement,
              p_title):

        d = {"title": "", "scopeAndContent": "", 'eventActors': ''}

        if levelOfDescription:
            d['levelOfDescription'] = levelOfDescription
        if l_id:
            d['legacyId'] = l_id
        else:
            d['legacyId'] = ""
        if p_id:
            d['parentId'] = p_id
        else:
            d['parentId'] = ""

        for child in elem:

            if child.tag == self.ns + 'did':

                for did in child:
                    if did.tag == self.ns + 'unittitle':
                        d['title'] = did.text
                        if p_arrangement == "":
                            d['arrangement'] = d['title']
                        else:
                            if d['title'] != "":
                                d['arrangement'] = p_arrangement + " >> " + p_title
                            else:
                                d['arrangement'] = p_arrangement

                    if did.tag == self.ns + 'unitid':

                        if 'type' in did.attrib:
                            if did.attrib['type'] == "call number":
                                d['identifier'] = did.text
                            if did.attrib['type'] == 'file reference':
                                d['alternativeIdentifiers'] = did.text
                                d['alternativeIdentifierLabels'] = 'Former call number'
                        else:
                            d['identifier'] = did.text
                    if did.tag == self.ns + 'origination':
                        if 'eventActors' not in d:
                            d['eventActors'] = did.text + "|"
                        else:
                            #d['eventActors']='|'.join(d['eventActors'].split['|'] + [did.text] )
                            d['eventActors'] += did.text + "|"
                    if did.tag == self.ns + "unitdate":
                        d['eventDates'] = did.text
                        d['eventStartDates'] = self.dm.build_eventDates(
                            d['eventDates'])[0]
                        d['eventEndDates'] = self.dm.build_eventDates(
                            d['eventDates'])[1]
                    if did.tag == self.ns + "physdesc":
                        for phd in did:
                            if phd.tag == self.ns + "genreform":
                                d['physicalCharacteristics'] = phd.text
                            if phd.tag == self.ns + "extent":
                                d["extendAndMedium"] = phd.text
                    if did.tag == self.ns + "langmaterial":
                        for lmt in did:
                            if lmt.tag == self.ns + "language":
                                d['language'] = self.dm.iso639_3to2(
                                    lmt.attrib['langcode'])
                                d['culture'] = d['language']
                                d['languageOfDescription'] = d['language']
                                d['script'] = lmt.attrib['scriptcode']
                                d['languageNote'] = lmt.text

                    if did.tag == self.ns + "abstract":
                        if 'scopeAndContent' in d:
                            d['scopeAndContent'] += did.text
                        else:
                            d['scopeAndContent'] = did.text
                    if did.tag == self.ns + 'odd':
                        for p in did:
                            if p.tag == self.ns + 'p':
                                d['title'] += " - " + p.text

            if child.tag == self.ns + "scopecontent":
                #d['scopeAndContent']=" ".join([x.text for x in child ])
                d['scopeAndContent'] += self.cleanHtml(
                    str(etree.tostring(child, encoding='unicode'))) + " \n"
            if child.tag == self.ns + "relatedmaterial":
                d['relatedUnitsOfDescription'] = self.cleanHtml(
                    str(etree.tostring(child, encoding='unicode')))

            if child.tag == self.ns + 'otherfindaid':
                success = False
                for p in child:
                    if p.tag == self.ns + 'p':
                        success = True
                        for extref in p:
                            if extref.tag == self.ns + "extref":
                                d['findingAids'] = extref.attrib[
                                    '{http://www.w3.org/1999/xlink}href']
                if not success:
                    for extref in child:
                        if extref.tag == self.ns + "extref":
                            d['findingAids'] = extref.attrib[
                                '{http://www.w3.org/1999/xlink}href']

            if child.tag == self.ns + "c":

                if 'level' in child.attrib:
                    levelOfDescription = child.attrib['level']
                if 'id' in child.attrib:
                    l_id = child.attrib['id']

                if d['legacyId'] == "":
                    self.get_c(child, levelOfDescription, l_id, d['parentId'],
                               p_arrangement, d['title'])
                else:
                    self.get_c(child, levelOfDescription, l_id, d['legacyId'],
                               d['arrangement'], d['title'])

        d['repository'] = self.meta['repository']
        if not ('culture' in d):
            d['culture'] = g.CULTURE
        if d['legacyId'] != "":
            if not d['title']:
                d['title'] = ""
            d['eventActors'] = d['eventActors'].strip('|')
            self.ead_data.append(d.copy())

    def cleanHtml(self, text):
        linefeeds = [
            "</p>", "<lb/>", "<br/>", "<br />", "</adressline>", "</head>"
        ]
        text = so.replaceChars(text, linefeeds, " \n")
        text = so.stripHtml(text, " ")
        text = text.replace("  ", " ").replace("  ", " ").replace(
            "  ", " ").replace("  ", " ").replace("  ", " ").replace(
                "  ", " ").replace("  ", " ").replace("  ", " ")
        #text=text.replace("\n\n","\n").replace("\n\n","\n").replace("\n\n","\n").replace("\n\n","\n")
        text = so.dedupSpaces(text)
        text = so.replaceChars(text, ["\n\n", "\n \n"], "\n")
        text = text.strip("\n")
        return text
Example #7
    def __init__(self):
        self.dm = data_manager.DataManager()
Example #8
class ai(object):
	
	TARGET_THEMATIC=1
	TARGET_NON_THEMATIC=0	
	FIELDS=["title","scopeAndContent","arrangement"]
	TRAIN_DATA=[]
	TEST_DATA=[]
	TRAIN_DATA_FILE="atom/data/train.json"
	TEST_DATA_FILE="atom/data/test.json"
	TRAIN_DATA_TEXT_FILE="atom/data/train.txt"
	TEST_DATA_TEXT_FILE="atom/data/test.txt"
	TRAIN_DATA_STR_FILE="atom/data/train_raw.txt"
	STOPPWORD_FILE="atom/data/stopp.txt"
	RATIO=0.7
	STOPWORTE=[]
	STOPWORTE_FILE="atom/data/stopworte.txt"
	TRAIN_FILE_LIST_FILE="atom/data/ai_train_files.txt"
	WORD2IX={}
	WORD2IX_FILE="atom/data/word2ix.json"
	WORD_STAT=[]
	WORD_STAT_FILE='atom/data/word_stats.txt'
	LEARNING_RATE=-0.005
	BOW_MODEL_FILE='atom/data/bow.pt'
	MODEL=object()
	BOW=[]
	EPOCHS=20
	
	dm=data_manager.DataManager()
	
	def __init__(self):
		self.TRAIN_DATA=[]
		self.TEST_DATA=[]
		
	
	def add2data_from_file_list(self,list_file=None):
		print("start")
		
		if list_file:
			file_list=fo.load_data(list_file,None,True)
		else:
			file_list=fo.load_data(self.TRAIN_FILE_LIST_FILE,None,True)
		#print(file_list)
		file_list=[x.split(",") for x in file_list]
		for fname,target in file_list:
			if fname[0:1]=="#":
				continue
			print("-----------------")
			print("file: ", fname, "  target:", target)
			self.add2data(fname,target,[0,1])
		print("Finish")
	
	def add2data_from_atom(self):
		sql="select concat(IFNULL(title, ''), IFNULL(arrangement, ''),IFNULL(scope_and_content, '')) as text from information_object_i18n ii join information_object i on ii.id=i.id where  i.level_of_description_id=461 and culture ='de';"
		l=self.dm.get_mysql(sql,False)
		print(len(l), " records found in AtoM database")
		new_l=[str(x[0]) for x in l]
		self.add2data(new_l,"1")
		
	
	def add2data(self,data,target,index_arr=[]):
		
		"""
		Adds content and target to a batch of data and add them to the data.
		Shuffels test_data and train_data once again
		"""
		if isinstance(data,str):
			if os.path.isfile(data) and data[-4:].lower()==".xml":
				l=self.addEADXML2data(data,target)
			else:
				l=self.prepare_data(data,index_arr)
		else:
			l=self.prepare_data(data,index_arr)
		print(len(l),"---")
		l=self.add_target(l,target)
		if len(l)>0:
			print(len(l), " records collected. Having target =", l[0][1] )
		else:
			print ("No records collected")
		l=self.clean_list_data(l,True,True)
		print(len(l), "Records cleaned.")
		#print(l)
		#return
		#if self.TRAIN_DATA is None:
		#	self.TRAIN_DATA	= fo.load_data_once(self.TRAIN_DATA,self.TRAIN_DATA_FILE)
		#print("Train data loaded. ",self.stat(self.TRAIN_DATA))
		#if self.TEST_DATA is None:
		#	self.TEST_DATA=fo.load_data_once(self.TEST_DATA,self.TEST_DATA_FILE)
		#print("Test data loaded. ", self.stat(self.TEST_DATA))
		#self.TRAIN_DATA = self.TRAIN_DATA.copy() + l.copy()
		#print("Data has been merged", self.TRAIN_DATA[len(self.TRAIN_DATA)-1][1])
		#self.stat(self.TRAIN_DATA)
		#self.store_data(self.RATIO)
		self.add_data2_store(l,self.RATIO)
		print("Data has been (shuffled and) stored")

	def prepare_data(self,data, index_arr=[]):
		"""
		Gets a EAD-CSV file or a list (of strings, list or tuples), extends
		\n the target to each element and returns a list of tupels
		data - the list or filename
		target - the target value for the nn
		index - the index of the values to use if list element is list or tuple
		"""
		rlist=[]

		if isinstance(data,list):
			l=data
		elif isinstance(data,str):		
			if os.path.isfile(data):
				l=fo.load_data(data)
				#print(len(l)," data len file", type(l), index)
				if isinstance(l,list):
					if isinstance(l[0],dict):
						l=self.read_data_from_csv(l,self.FIELDS,{"levelOfDescription":"File"})
					elif (isinstance(l[0],list) or isinstance(l[0],tuple)) and len(index_arr)>1:
						#new_l=[x[index] for x in l]
						new_l=[x[index_arr[0]]+" "+x[index_arr[1]] for x in l]
						l=new_l
					elif isinstance(l[0],str):
						l=l
				#print(len(l)," data len file", type(l), index)	
			else:
				l=data.split("\n")
		return l.copy()
	
	def add_target(self,l,target):
		print("target",target)
		rlist=[]
		print(type(l[0]))
		for e in l:
			if isinstance(e,str):
				rlist.append((e,target))
			elif isinstance(e,list) or isinstance(e,tuple):
				if len(e)>1:
					rlist.append((e[0],e[1]))
				elif len(e)==1:
					rlist.append((e[0],target))
		
		return rlist.copy()
	
	def clean_list_data(self,l, stemm=True, stopp=True):
		if stopp:
			stopw=fo.load_data(self.STOPPWORD_FILE,None,True)
		rlist=[]
		l=lo.dedup(l)
		i=0
		total=len(l)
		current=0
		for e in l:
			item=e[0]
			item=so.replaceChars(item,so.SATZZEICHEN,"")
			item=so.replaceChars(item,so.LQGQ,"")
			item=so.replaceChars(item,so.ANFUHRUNGSZEICHEN,"")
			item=so.replaceChars(item,so.STRICHE," ")
			item=so.replaceChars(item,['\n']," ")
			item=so.replaceChars(item,[']n'],' ')
			item=so.replaceChars(item,['\t',"b'",'¤',"‧"],' ')
			item=so.replaceChars(item,"'°%§[]*"," ")
			item=so.replaceChars(item,"0123456789","")
			item=so.replaceChars(item,['xc3xa4','xc3x84'],'a')
			item=so.replaceChars(item,['xc3xb6','xc3x96'],'o')
			item=so.replaceChars(item,['xc3xbc','xc3x9c'],'u')
			item=so.replaceChars(item,['xc3x9f'],'ss')
			
			item=so.dedupSpaces(item)
			if stemm:
				item=stemmer.stem(item)
			if stopp:
				item=" ".join([x for x in item.split(" ") if x not in stopw])
			
			if int(i/total*100)>current:
				current=int(i/total*100)
				sys.stdout.write("\033[F")
				print(current+1, "% ")
				
			i+=1
			if item!="" and item not in [str(x) for x in range(0,10)]:
				rlist.append((item,e[1]))	
		return rlist
			
	def clean_text(self,item,stemm=True, stopp=True):
		if stopp:
			stopw=fo.load_data(self.STOPPWORD_FILE,None,True)
		item=so.replaceChars(item,so.SATZZEICHEN,"")
		item=so.replaceChars(item,so.LQGQ,"")
		item=so.replaceChars(item,so.ANFUHRUNGSZEICHEN,"")
		item=so.replaceChars(item,so.STRICHE," ")
		item=so.replaceChars(item,['\n']," ")
		item=so.replaceChars(item,[']n'],' ')
		item=so.replaceChars(item,['\t',"b'",'¤',"‧"],' ')
		item=so.replaceChars(item,"'°%§[]*"," ")
		item=so.replaceChars(item,"0123456789","")
		item=so.replaceChars(item,['xc3xa4','xc3x84'],'a')
		item=so.replaceChars(item,['xc3xb6','xc3x96'],'o')
		item=so.replaceChars(item,['xc3xbc','xc3x9c'],'u')
		item=so.replaceChars(item,['xc3x9f'],'ss')
		item=so.dedupSpaces(item)
		if stemm:
			item=stemmer.stem(item)
		if stopp:
			item=" ".join([x for x in item.split(" ") if x not in stopw])
		return item
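
A rough standalone equivalent of the normalization above, with the helper constants approximated by regular expressions (stemming and the real SATZZEICHEN/STRICHE lists are omitted):

import re

def normalize(item):
    item = re.sub(r"[.,;:!?()\[\]*'\"°%§]", "", item)  # rough punctuation strip
    item = re.sub(r"[-–—]", " ", item)                 # dashes to spaces
    item = re.sub(r"[0-9]", "", item)                  # drop digits
    item = re.sub(r"\s+", " ", item).strip()           # collapse whitespace
    return item

print(normalize("Akten (1920-1945); Korrespondenz!"))  # Akten Korrespondenz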
	
	def addEADXML2data(self,sourcefile,target):
		eadx=eadxml.eadxml(sourcefile,"")
		l=eadx.transform()
		fo.save_data(l,"/tmp/test.json")
		rlist=[]
		for e in l:
			if "levelOfDescription" in e:
				item=""
				if "title" in e:
					
					item=e["title"]+" "
				if "scopeAndContent" in e:
					item+=e["scopeAndContent"] +" "
				if "arrangement" in e:
					item+=e["arrangement"]
				if item != "":
					rlist.append(item)
		rlist=[(x,target) for x in rlist]
		return rlist
		

	def stat(self,l):
		if l:
			t0=[x for x in l if int(x[1])==0]
			t1=[x for x in l if int(x[1])==1]
			return("Len: ", len(l)," T 0 = ", len(t0), " T 1 = ", len(t1))
		else:
			return("list to count does not exist")

	def read_data_from_csv(self,sourcefile,fields, restrictions):
		"""
		Returns a list whose elements are the string concatenation of the required fields.
		"""
		rlist=[]
		l=sourcefile  # the caller already passes the loaded list of dicts
		if l:
			for e in l:
				if restrictions:
					# skip records that fail any restriction
					if any(e.get(k)!=v for k,v in restrictions.items()):
						continue
				item=""
				for k in e.keys():
					if k in fields:
						item += e[k]+" "
				rlist.append(item)
			return rlist
		else:
			print("Could not read list of dicts")
			return []
				
						
		
	"""
	def open_train_data(self):
		self.TRAIN_DATA=fileOps.load_data_once(self.TRAIN_DATA,self.TRAIN_DATA_FILE)	
	
	def open_test_data(self):
		self.TEST_DATA=fileOps.load_data_once(self.TEST_DATA,self.TEST_DATA_FILE)
	"""
	
	def store_data_json(self, ratio):
		
		print(len(self.TRAIN_DATA), "Train len")
		new_data=[]
		new_train_data=[]
		new_test_data=[]
		if self.TEST_DATA:
			tt_data=self.TEST_DATA+self.TRAIN_DATA
		else:
			tt_data=self.TRAIN_DATA
		tt_data=lo.dedup(tt_data)
		count=len(tt_data)
		for i in range(0,len(tt_data)):
			item=random.choice(tt_data)
			if i>(count*ratio):
				new_test_data.append(item)
			else:
				new_train_data.append(item)
			tt_data.remove(item)
		
		#fo.save_data(new_test_data, self.TEST_DATA_FILE)
		#fo.save_data(new_train_data, self.TRAIN_DATA_FILE)
		
	def add_data2_store(self,data,ratio):
		train_str=fo.load_data(self.TRAIN_DATA_STR_FILE,None,False)
		print(len(train_str)," Train String Len")
		add_str=""
		dup_c=0
		for e in data:
			add_str+=e[0]+"|"+e[1]+"\n"
			"""
			if train_str.find(e[0])<0 and add_str.find(e[0])<0:
			#if True:
				add_str+=e[0]+"|"+e[1]+"\n"
			else:
				dup_c+=1
			"""
		data=[]
		#print(dup_c, " duplicates removed")
		train_str=train_str+add_str
		fo.save_data(train_str,self.TRAIN_DATA_STR_FILE)
		if os.path.isfile(self.TRAIN_DATA_STR_FILE):
			subprocess.Popen(["cp",self.TRAIN_DATA_STR_FILE,self.TRAIN_DATA_STR_FILE + ".bak"  ] )
		#subprocess.Popen("perl -i.bak -ne 'print if ! $x{$_}++' "+ self.TRAIN_DATA_STR_FILE)
		return
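	# Each stored line has the form "<cleaned text>|<label>", e.g. (illustrative):
	#     brief nachlass korrespondenz|1
	#     rechnung verwaltung|0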
		
	def index_words(self):
		"""Build WORD2IX, a word -> integer index map over the training file."""
		i = 0
		print(" ")
		with open(self.TRAIN_DATA_STR_FILE) as f:
			for line in f:
				i += 1
				sys.stdout.write("\033[F")  # move cursor up: in-place progress counter
				print(i)
				items = line.split("|")
				text = items[0].split(" ")
				for word in text:
					if not word.isnumeric():
						if word not in self.WORD2IX:
							self.WORD2IX[word] = len(self.WORD2IX)
		fo.save_data(self.WORD2IX, self.WORD2IX_FILE)
		return len(self.WORD2IX)
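	# WORD2IX maps each vocabulary word to a fixed position in the bag-of-words
	# vector, e.g. (illustrative): {"brief": 0, "nachlass": 1, "akte": 2, ...}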
	
	
	def word_stats(self):
		# stub: loads the word statistics file, the analysis itself is not implemented
		frequencies = []
		if len(self.WORD_STAT) == 0:
			l = fo.load_data(self.WORD_STAT_FILE)
	
	def create_sets(self, ratio):
		"""Split the stored "text|label" lines randomly into train and test files."""
		train_data = ""
		test_data = ""
		with open(self.TRAIN_DATA_STR_FILE, "r") as f:
			lines = f.readlines()
		count = len(lines)
		random.shuffle(lines)
		print("")
		for i, line in enumerate(lines):
			sys.stdout.write("\033[F")  # in-place progress counter
			print(i)
			item = line.strip("\n").split("|")
			mod_text = lo.remove_duplicate_followers(item[0])
			if len(item) > 1:
				if i > (count * ratio):
					test_data += mod_text + "|" + item[1] + "\n"
				else:
					train_data += mod_text + "|" + item[1] + "\n"
		fo.save_data(test_data, self.TEST_DATA_TEXT_FILE)
		test_data = ""
		fo.save_data(train_data, self.TRAIN_DATA_TEXT_FILE)
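	# Sketch: c.create_sets(0.8) routes roughly 80% of the stored lines to
	# TRAIN_DATA_TEXT_FILE and the remaining 20% to TEST_DATA_TEXT_FILE.
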
	def catFromOutput(self, out):
		# index of the highest-scoring class in the network output
		_, i = out.data.topk(1)
		return i

	"""
	Disabled one-hot helpers, kept for reference:

	def wordToTensor(self, word):
		ret = torch.zeros(1, len(self.WORD2IX))
		ret[0][self.WORD2IX[word]] = 1
		return ret

	def textToTensor(self, text):
		ret = torch.zeros(len(text), 1, len(self.WORD2IX))
		for i, word in enumerate(text):
			ret[i][0][self.WORD2IX[word]] = 1
		return ret
	"""

		
	def getTrainData(self, test=False):
		"""Yield (word_list, label) pairs from the train or test text file."""
		if test:
			file_name = self.TEST_DATA_TEXT_FILE
		else:
			file_name = self.TRAIN_DATA_TEXT_FILE
		with open(file_name, "r") as f:
			for line in f:
				# each line has the form "<text>|<label>\n"
				parts = line.rstrip("\n").split("|")
				if len(parts) > 1 and parts[1] in ("0", "1"):
					yield parts[0].split(), int(parts[1])
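	# Example of one yielded pair (illustrative):
	#     (["brief", "nachlass", "korrespondenz"], 1)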
		
	def make_bow_vector(self, sentence):
		# count vector of vocabulary size; out-of-vocabulary words are ignored
		vec = torch.zeros(len(self.WORD2IX))
		for word in sentence:
			if word in self.WORD2IX:
				vec[self.WORD2IX[word]] += 1
		return vec.view(1, -1)
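	# E.g. with WORD2IX = {"brief": 0, "akte": 1} (illustrative), the sentence
	# ["brief", "brief", "akte"] becomes the tensor [[2., 1.]].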

	def make_target(self, value):
		# wrap the integer class label as required by nn.NLLLoss
		return torch.LongTensor([value])
	
	def training(self):
		"""Train the bag-of-words classifier with NLLLoss and plain SGD."""
		if len(self.WORD2IX) == 0:
			self.WORD2IX = fo.load_data(self.WORD2IX_FILE)
		if isinstance(self.BOW, list):
			self.BOW = BOWClassifier(2, len(self.WORD2IX))
		if os.path.isfile(self.BOW_MODEL_FILE):
			# resume from a previously saved model
			self.BOW = torch.load(self.BOW_MODEL_FILE)
		self.BOW.train()  # make sure the model is in training mode
		# define a loss function and an optimizer
		loss_function = nn.NLLLoss()
		opt = torch.optim.SGD(self.BOW.parameters(), lr=0.05)

		# the training loop
		print(" T:", strftime("%Y-%m-%d %H:%M:%S", gmtime()))
		for epoch in range(self.EPOCHS):
			for instance, label in self.getTrainData():
				self.BOW.zero_grad()
				bow_vec = Variable(self.make_bow_vector(instance))
				label = Variable(self.make_target(label))
				probs = self.BOW(bow_vec)  # forward pass
				loss = loss_function(probs, label)
				loss.backward()
				opt.step()
			print('Epoch:', epoch, '  CURRENT LOSS: {}'.format(loss.data), " T:", strftime("%Y-%m-%d %H:%M:%S", gmtime()))
			torch.save(self.BOW, self.BOW_MODEL_FILE)
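	# Typical round trip (sketch, assuming the data files already exist and
	# `c` is an instance of this class):
	#     c.index_words()     # build the vocabulary
	#     c.create_sets(0.8)  # ~80% train / 20% test
	#     c.training()
	#     c.testing()
	#     c.predict("Nachlass, Briefe")  # -> 0 or 1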
	
	def testing(self):
		"""Evaluate the stored model on the held-out test file and print accuracy."""
		if len(self.WORD2IX) == 0:
			self.WORD2IX = fo.load_data(self.WORD2IX_FILE)
		if isinstance(self.BOW, list):
			self.BOW = BOWClassifier(2, len(self.WORD2IX))
			self.BOW = torch.load(self.BOW_MODEL_FILE)
			self.BOW.eval()
		i = 0
		c = 0
		print("")
		for instance, label in self.getTrainData(True):
			i += 1
			bow_vec = Variable(self.make_bow_vector(instance))
			logprobs = self.BOW(bow_vec)
			pred = np.argmax(logprobs.data.numpy())
			if pred == int(label):
				c += 1
		sys.stdout.write("\033[F")
		print("i= ", i, " Correct: ", 100 * c / i if i else 0)
		
	
	def predict(self, text):
		"""Classify a raw text; returns the predicted label (0 or 1)."""
		if len(self.WORD2IX) == 0:
			self.WORD2IX = fo.load_data(self.WORD2IX_FILE)
		if isinstance(self.BOW, list):
			self.BOW = BOWClassifier(2, len(self.WORD2IX))
			self.BOW = torch.load(self.BOW_MODEL_FILE)
			self.BOW.eval()
		text = self.clean_text(text)
		bow_vec = Variable(self.make_bow_vector(text.split()))
		logprobs = self.BOW(bow_vec)
		pred = np.argmax(logprobs.data.numpy())
		return pred
Example #9
0
class access_points(object):

    ACCESS_POINTS_LIST = []
    ACCESS_POINTS_LIST_FILE = "atom/data/access_points.json"
    dm = data_manager.DataManager()
    SUBJECT_AP = 35
    PLACE_AP = 42
    GENRE_AP = 78

    def __init__(self):

        self.ACCESS_POINTS_LIST = fo.load_data(self.ACCESS_POINTS_LIST_FILE)
        self.update_access_points_list()

    def update_access_points_list(self):
        """Refresh the cached access point list from the AtoM term tables."""
        sql = 'select t.id, t.parent_id,ti.name,ti.culture, t.taxonomy_id from term_i18n ti join term t on ti.id=t.id  where t.taxonomy_id in (35,42,78) and ti.culture in ("en","fr","de") order by ti.name;'
        r = self.dm.get_mysql(sql, False)
        for e in r:
            # fetch the alternative names recorded for this particular term
            sql = 'select group_concat(name) as term from other_name o join other_name_i18n oi on o.id=oi.id where object_id=' + str(e[0]) + ' group by object_id;'
            alt_names = self.dm.get_mysql(sql, True)
            item = next(
                (x for x in self.ACCESS_POINTS_LIST if x['id'] == e[0]), False)
            if item:
                item['culture_' + e[3]] = e[2]
                item['parent_id'] = e[1]
                item['type'] = e[4]
                if alt_names:
                    if 'indicators' in item:
                        item['indicators'] = list(
                            set(item['indicators']).union(
                                set(alt_names[0].split(","))))
                    else:
                        item['indicators'] = alt_names[0].split(",")
            else:
                d = {}
                d['culture_' + e[3]] = e[2]
                d['parent_id'] = e[1]
                d['type'] = e[4]
                d['id'] = e[0]
                if alt_names:
                    d['indicators'] = alt_names[0].split(",")
                self.ACCESS_POINTS_LIST.append(d)
        fo.save_data(self.ACCESS_POINTS_LIST, self.ACCESS_POINTS_LIST_FILE)
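    # Each cached entry is a dict like (illustrative values):
    #     {"id": 123, "parent_id": 42, "type": 42,
    #      "culture_de": "Berlin", "indicators": ["Berlin-Mitte"]}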

    def lookup(self, text, culture="de", return_text=True):
        """
        Checks whether one or more access points can be connected to a given text.
        If return_text is False, term ids are returned instead of the terms.
        """
        text = text.strip(" ")
        tmp = {self.SUBJECT_AP: [], self.PLACE_AP: [], self.GENRE_AP: []}
        for ap in self.ACCESS_POINTS_LIST:
            # skip this access point entirely if an exclusion pattern matches
            if 'exclusions' in ap:
                if any(re.search(ex, text, re.IGNORECASE) for ex in ap['exclusions']):
                    continue
            test = False
            if 'culture_' + culture in ap:
                pattern = re.compile(ap['culture_' + culture], re.IGNORECASE)
                if re.search(pattern, text):
                    test = True
            if 'indicators' in ap:
                for ind in ap['indicators']:
                    pattern = re.compile(ind, re.IGNORECASE)
                    if re.search(pattern, text):
                        test = True
            if test:
                if return_text:
                    if 'culture_' + culture in ap:
                        tmp[ap['type']].append(ap['culture_' + culture])
                    elif 'culture_de' in ap:
                        tmp[ap['type']].append(ap['culture_de'])
                    else:
                        tmp[ap['type']].append(ap['id'])
                else:
                    tmp[ap['type']].append(ap['id'])
        return tmp
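    # Sketch of a lookup call (illustrative values); the result is keyed by
    # the taxonomy ids SUBJECT_AP (35), PLACE_AP (42) and GENRE_AP (78):
    #     ap = access_points()
    #     ap.lookup("Briefe aus Berlin")
    #     # -> {35: [], 42: ["Berlin"], 78: ["Briefe"]}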