def make_sort_trinomial(self, trinomial):
    """Build a sortable string form of a trinomial.

    The trinomial is split into parts with
    ``TrinomialManage.parse_trinomial``; the site part is passed through
    ``self.prepend_site_zeros(site, 5)`` and appended to the state and
    county parts (each converted with ``str``).
    """
    parts = TrinomialManage().parse_trinomial(trinomial)
    padded_site = self.prepend_site_zeros(parts['site'], 5)
    state_and_county = str(parts['state']) + str(parts['county'])
    return state_and_county + padded_site
def find_trinomials_in_metadata(self, xml):
    """Collect trinomials mentioned in the metadata of each ``oai:record``.

    For every record found in ``xml`` a summary dict is built: record URI,
    title, citation HTML, fixed source label/URI, ``self.request_url`` and
    the trinomials returned by ``self.extract_texas_trinomials_from_text``
    for the record's ``dc:title``, ``dc:description`` and ``dc:subject``
    elements. Records with at least one trinomial are appended to
    ``self.trinomial_refs``. Trinomials not seen before are added to
    ``self.unique_trinomials`` and their state + county prefix to
    ``self.unique_state_counties``. Progress is printed; nothing is
    returned.
    """
    records = xml.xpath('//oai:record', namespaces=self.namespaces)
    print('Number of records in XML: ' + str(len(records)))
    for record in records:
        ref = {
            'rec_uri': None,
            'title': None,
            'citation_html': None,
            'source_label': 'Index of Texas Archaeology: Open Access Gray Literature from the Lone Star State',
            'source_uri': 'http://scholarworks.sfasu.edu/ita',
            'request_url': self.request_url,
            'trinomials': [],
        }

        # The last identifier that is not a 'viewcontent.cgi' link wins;
        # if every identifier is such a link, the first one is kept.
        chosen_uri = None
        identifiers = record.xpath('oai:metadata/oai_dc:dc/dc:identifier',
                                   namespaces=self.namespaces)
        for identifier in identifiers:
            uri = identifier.text
            if 'viewcontent.cgi' not in uri or chosen_uri is None:
                chosen_uri = uri
        ref['rec_uri'] = chosen_uri

        # The extractor receives the trinomials found so far and hands
        # back the updated list.
        found = ref['trinomials']
        for title in record.xpath('oai:metadata/oai_dc:dc/dc:title',
                                  namespaces=self.namespaces):
            ref['title'] = title.text
            found = self.extract_texas_trinomials_from_text(title.text, found)
        for path in ('oai:metadata/oai_dc:dc/dc:description',
                     'oai:metadata/oai_dc:dc/dc:subject'):
            for node in record.xpath(path, namespaces=self.namespaces):
                found = self.extract_texas_trinomials_from_text(node.text,
                                                                found)
        ref['trinomials'] = found

        ref['citation_html'] = self.make_citation_html(record)
        print('-----------------------------------')
        print(ref['citation_html'])
        print('Trinomials: ' + str(ref['trinomials']))
        print('-----------------------------------')

        if len(ref['trinomials']) < 1:
            continue
        # This publication mentions trinomials, so keep it.
        self.trinomial_refs.append(ref)
        for trinomial in ref['trinomials']:
            if trinomial in self.unique_trinomials:
                continue
            self.unique_trinomials.append(trinomial)
            # Track the state + county prefix of each new trinomial too.
            parsed = TrinomialManage().parse_trinomial(trinomial)
            state_county = parsed['state'] + parsed['county']
            if state_county not in self.unique_state_counties:
                self.unique_state_counties.append(state_county)
def validate_token_trinomial(self, token):
    """Return a well-formed trinomial for ``token``, or None if it is not one.

    The token is upper-cased and handed to
    ``TrinomialManage.parse_trinomial``. When parsing yields a dict, its
    parts are validated:

    * state: 1 or 2 characters that convert to an integer, and equal to
      ``self.only_valid_state_id`` when that attribute is a string;
    * county: exactly 2 characters once everything except ASCII letters
      has been removed;
    * site: non-empty and convertible to an integer.

    Args:
        token: candidate text (a string) to test.

    Returns:
        The state, county and site parts concatenated into one string when
        every check passes, otherwise None.
    """
    def converts_to_int(value):
        # Same conversion the state check has always used: whatever
        # int(float(...)) accepts counts as an integer.
        try:
            int(float(value))
        except (TypeError, ValueError, OverflowError):
            return False
        return True

    # Upper-case first so county codes come out upper case if this token
    # turns out to be a trinomial.
    token = token.upper()
    tri_m = TrinomialManage()
    try:
        tri_p = tri_m.parse_trinomial(token)
    except Exception:
        # Best effort: a token the parser cannot handle is simply not a
        # trinomial. KeyboardInterrupt/SystemExit are no longer swallowed.
        return None
    if not isinstance(tri_p, dict):
        return None

    state = tri_p['state']
    # Make sure the county part is only letters.
    county = re.sub('[^a-zA-Z]+', '', tri_p['county'])
    site = tri_p['site']

    if not 1 <= len(state) <= 2 or not converts_to_int(state):
        # State code has the wrong length or is not a number.
        return None
    if (isinstance(self.only_valid_state_id, str)
            and state != self.only_valid_state_id):
        # Only one state id is allowed and this is not it.
        return None
    if len(county) != 2:
        return None
    # The site number must be present AND numeric. The previous code
    # re-tested the state here, so the site was never actually checked.
    if len(site) < 1 or not converts_to_int(site):
        return None

    # All parts are valid: assemble the full trinomial.
    return state + county + site
def make_trinomial_instances_df(doc_dir):
    """Build a DataFrame of possible trinomials found in ``.txt`` files.

    Walks ``doc_dir`` with ``os.walk``, reads every file whose name ends in
    '.txt' and searches its text for tokens shaped like one or two digits,
    two or more capital letters, then one or more digits. Tokens starting
    with '0' are skipped, as are tokens whose parsed state number is not
    between 1 and 50. Each remaining token becomes one row (filename,
    token, state number, county part, site part) and is reported with
    ``print``.

    Returns the populated ``pandas.DataFrame``.
    """
    parser = TrinomialManage()
    parser.remove_prepended_zeros = True
    column_names = [
        'filename',
        'pos_trinomial',
        'state_num',
        'region_abbr',
        'site_number',
    ]
    df = pd.DataFrame(columns=column_names)
    row_count = 0
    for folder, _subfolders, names in os.walk(doc_dir):
        for name in names:
            if not name.endswith('.txt'):
                continue
            with open(os.path.join(folder, name), 'r') as handle:
                text = handle.read()
            # findall returns one tuple per hit because the pattern has two
            # capture groups; sets drop repeated hits and repeated groups.
            hits = set(
                re.findall(r'(\b([0-9]{1,2}[A-Z]{2,}[0-9]{1,})\b)', text)
            )
            for hit in hits:
                for candidate in set(hit):
                    if candidate.startswith('0'):
                        # not a trinomial
                        continue
                    parts = parser.parse_trinomial(candidate)
                    state = int(parts['state'])
                    if not 1 <= state <= 50:
                        # not a state, skip
                        continue
                    county = parts['county']
                    site = parts['site']
                    df.loc[row_count] = [name, candidate, state, county, site]
                    row_count += 1
                    print('[{}] Found {} in {} ({}, {}, {})'.format(
                        row_count, candidate, name, state, county, site))
    return df
def make_aux_trinomial_list(self, trinomial):
    """Return auxiliary, non-standard spellings of a trinomial.

    Two strings are produced with ``self.join_parts('-', parts)``: one from
    the parts exactly as parsed, and one after the site part has been
    replaced by ``self.prepend_site_zeros(site, 5)``.
    """
    parts = TrinomialManage().parse_trinomial(trinomial)
    # Join the parts with a '-' separator.
    dashed = self.join_parts('-', parts)
    # The parsed dict is updated in place before the second join.
    parts['site'] = self.prepend_site_zeros(parts['site'], 5)
    dashed_padded = self.join_parts('-', parts)
    return [dashed, dashed_padded]
def make_sort_trinomial(self, trinomial):
    """Return a sortable version of ``trinomial``.

    Parses the trinomial with ``TrinomialManage.parse_trinomial``, runs the
    site part through ``self.prepend_site_zeros(site, 5)`` and returns the
    state, county and processed site joined into a single string.
    """
    manager = TrinomialManage()
    pieces = manager.parse_trinomial(trinomial)
    site_for_sorting = self.prepend_site_zeros(pieces['site'], 5)
    leading = ''.join([str(pieces['state']), str(pieces['county'])])
    return leading + site_for_sorting
def match_trinomial_obj(self, tri):
    """Look up a trinomial object in tDAR unless it is already linked.

    Nothing is requested when ``tri.uuid`` has no ``Manifest`` record, or
    when a 'dc-terms:subject' ``LinkAnnotation`` whose object URI contains
    ``self.TDAR_VOCAB`` already exists for it. Otherwise the trinomial is
    sent to ``tdarAPI.get_site_keyword``; when ``self.lead_zero_check`` is
    set, variants with the site number zero-padded up to four characters
    are requested as well.

    A result counts as a match when its label equals the trinomial exactly,
    or equals it with the site number left-padded with zeros (up to five
    characters). For each match a ``LinkEntity`` is saved if none exists
    for the result id, and a ``LinkAnnotation`` is saved.

    After each request ``tdar_api.request_error`` is checked. On failure
    ``self.error_wait`` grows by ``self.base_wait`` and the method sleeps
    that long, or ends the process with ``sys.exit`` once the wait exceeds
    ``self.max_wait``.

    Returns the number of matching tDAR results.
    """
    found_matches = 0
    try:
        manifest = Manifest.objects.get(uuid=tri.uuid)
    except Manifest.DoesNotExist:
        manifest = False
    already_linked = LinkAnnotation.objects.filter(
        subject=tri.uuid,
        predicate_uri='dc-terms:subject',
        object_uri__contains=self.TDAR_VOCAB,
    )[:1]
    if len(already_linked) >= 1 or manifest is False:
        # Either a tDAR id is already recorded or there is no manifest item.
        return found_matches

    tri_man = TrinomialManage()
    request_keywords = [tri.trinomial]
    if self.lead_zero_check:
        # Also ask for every zero-padded site number up to 4 characters.
        parts = tri_man.parse_trinomial(tri.trinomial)
        site = parts['site']
        for zeros in range(1, 5 - len(site)):
            request_keywords.append(
                parts['state'] + parts['county'] + '0' * zeros + site)

    for keyword in request_keywords:
        tdar_api = tdarAPI()
        results = tdar_api.get_site_keyword(keyword)
        if isinstance(results, list):
            for result in results[:self.max_results]:
                # Assume a spurious match unless shown otherwise.
                match_real = result['label'] == tri.trinomial
                if not match_real:
                    # Accept a label that differs only in leading zeros
                    # of the site number (up to 5 characters).
                    parts = tri_man.parse_trinomial(tri.trinomial)
                    site = parts['site']
                    match_real = any(
                        parts['state'] + parts['county'] + '0' * zeros + site
                        == result['label']
                        for zeros in range(1, 6 - len(site))
                    )
                if not match_real:
                    print('Almost! ' + result['label'] + ' is not exactly: ' + tri.trinomial)
                    continue

                found_matches += 1
                # First make sure the linked entity is in the link entity
                # table.
                entity_exists = True
                try:
                    LinkEntity.objects.get(uri=result['id'])
                except LinkEntity.DoesNotExist:
                    entity_exists = False
                if not entity_exists:
                    le = LinkEntity()
                    le.uri = result['id']
                    le.label = result['label']
                    le.alt_label = result['label']
                    le.vocab_uri = self.TDAR_VOCAB
                    le.ent_type = 'type'
                    le.save()
                # Now save the link annotation.
                la = LinkAnnotation()
                la.subject = tri.uuid
                la.subject_type = manifest.item_type
                la.project_uuid = manifest.project_uuid
                la.source_id = 'tdar-api-lookup'
                la.predicate_uri = self.DC_TERMS_SUBJECT
                la.object_uri = result['id']
                la.save()

        if not tdar_api.request_error:
            self.request_error = False
            if self.error_wait >= self.base_wait:
                print('HTTP requests resumed OK, will continue.')
                self.error_wait = 0
            continue

        self.request_error = True
        print('HTTP request to tDAR failed!')
        self.error_wait += self.base_wait
        if self.error_wait > self.max_wait:
            print('Too many failures, quiting...')
            sys.exit('Quitting process')
        else:
            # Wait before moving on to the next request.
            print('Will try again in ' + str(self.error_wait) + ' seconds...')
            sleep(self.error_wait)
    return found_matches
def match_trinomial_obj(self, tri):
    """Try to match the trinomial object ``tri`` against tDAR.

    The lookup is skipped when no ``Manifest`` exists for ``tri.uuid`` or
    when a 'dc-terms:subject' ``LinkAnnotation`` with an object URI
    containing ``self.TDAR_VOCAB`` is already stored for it.

    Keywords sent to ``tdarAPI.get_site_keyword`` are the trinomial itself
    and, when ``self.lead_zero_check`` is set, the trinomial with its site
    number zero-padded one step at a time up to four characters. At most
    ``self.max_results`` results per keyword are examined. A result is
    accepted when its label equals the trinomial, or equals the trinomial
    with the site number zero-padded to any length up to five characters.
    Accepted results get a ``LinkEntity`` (if not already stored) and a new
    ``LinkAnnotation``.

    Request failures reported by ``tdar_api.request_error`` add
    ``self.base_wait`` to ``self.error_wait``; the method then sleeps for
    that many seconds, or calls ``sys.exit`` when the wait is larger than
    ``self.max_wait``.

    Returns the count of accepted results.
    """
    def zero_padded_forms(parts, max_site_len):
        # Yield state + county + site once for every leading zero added
        # while the site is shorter than max_site_len.
        site = parts['site']
        while len(site) < max_site_len:
            site = '0' + site
            yield parts['state'] + parts['county'] + site

    found_matches = 0
    try:
        manifest = Manifest.objects.get(uuid=tri.uuid)
    except Manifest.DoesNotExist:
        manifest = False
    la_check = LinkAnnotation.objects.filter(
        subject=tri.uuid,
        predicate_uri='dc-terms:subject',
        object_uri__contains=self.TDAR_VOCAB,
    )[:1]
    needs_lookup = len(la_check) < 1 and manifest is not False
    if not needs_lookup:
        return found_matches

    # No tDAR id is stored for this item yet, so go looking for one.
    tri_man = TrinomialManage()
    request_keywords = [tri.trinomial]
    if self.lead_zero_check:
        parsed = tri_man.parse_trinomial(tri.trinomial)
        request_keywords.extend(zero_padded_forms(parsed, 4))

    for keyword in request_keywords:
        tdar_api = tdarAPI()
        results = tdar_api.get_site_keyword(keyword)
        candidates = results[:self.max_results] if isinstance(results, list) else []
        for result in candidates:
            if result['label'] == tri.trinomial:
                # Exact match between trinomial and tDAR label.
                match_real = True
            else:
                # Only a difference in leading zeros is tolerated.
                parsed = tri_man.parse_trinomial(tri.trinomial)
                match_real = any(
                    form == result['label']
                    for form in zero_padded_forms(parsed, 5)
                )
            if match_real:
                found_matches += 1
                # Store the linked entity unless it is already known.
                try:
                    le_check = LinkEntity.objects.get(uri=result['id'])
                except LinkEntity.DoesNotExist:
                    le_check = False
                if le_check is False:
                    le = LinkEntity()
                    le.uri = result['id']
                    le.label = result['label']
                    le.alt_label = result['label']
                    le.vocab_uri = self.TDAR_VOCAB
                    le.ent_type = 'type'
                    le.save()
                # Then store the annotation linking the item to it.
                la = LinkAnnotation()
                la.subject = tri.uuid
                la.subject_type = manifest.item_type
                la.project_uuid = manifest.project_uuid
                la.source_id = 'tdar-api-lookup'
                la.predicate_uri = self.DC_TERMS_SUBJECT
                la.object_uri = result['id']
                la.save()
            else:
                print('Almost! ' + result['label'] + ' is not exactly: ' + tri.trinomial)

        if tdar_api.request_error:
            self.request_error = True
            print('HTTP request to tDAR failed!')
            self.error_wait += self.base_wait
            if self.error_wait > self.max_wait:
                print('Too many failures, quiting...')
                sys.exit('Quitting process')
            else:
                # Pause before the next request.
                print('Will try again in ' + str(self.error_wait) + ' seconds...')
                sleep(self.error_wait)
        else:
            self.request_error = False
            if self.error_wait >= self.base_wait:
                print('HTTP requests resumed OK, will continue.')
                self.error_wait = 0
    return found_matches
def find_trinomials_in_metadata(self, xml):
    """Find trinomials in the metadata of every ``oai:record`` in ``xml``.

    Each record is described by a dict holding its URI, title, citation
    HTML (from ``self.make_citation_html``), fixed source label and URI,
    ``self.request_url`` and the trinomials that
    ``self.extract_texas_trinomials_from_text`` reports for the record's
    ``dc:title``, ``dc:description`` and ``dc:subject`` texts.

    Dicts with one or more trinomials are appended to
    ``self.trinomial_refs``. Previously unseen trinomials go into
    ``self.unique_trinomials``, and their state + county code into
    ``self.unique_state_counties`` if not already there. The method prints
    a summary per record and returns nothing.
    """
    def scan(nodes, summary, keep_title=False):
        # Feed each element's text to the extractor, which receives the
        # trinomials gathered so far and returns the updated list.
        for node in nodes:
            if keep_title:
                summary['title'] = node.text
            summary['trinomials'] = self.extract_texas_trinomials_from_text(
                node.text, summary['trinomials'])

    recs = xml.xpath('//oai:record', namespaces=self.namespaces)
    print('Number of records in XML: ' + str(len(recs)))
    for rec in recs:
        summary = {
            'rec_uri': None,
            'title': None,
            'citation_html': None,
            'source_label': 'Index of Texas Archaeology: Open Access Gray Literature from the Lone Star State',
            'source_uri': 'http://scholarworks.sfasu.edu/ita',
            'request_url': self.request_url,
            'trinomials': [],
        }

        idents = rec.xpath('oai:metadata/oai_dc:dc/dc:identifier',
                           namespaces=self.namespaces)
        best_uri = None
        for ident in idents:
            candidate = ident.text
            # Identifiers without 'viewcontent.cgi' always take over; a
            # 'viewcontent.cgi' one is used only when nothing is set yet.
            if 'viewcontent.cgi' not in candidate or best_uri is None:
                best_uri = candidate
        summary['rec_uri'] = best_uri

        scan(rec.xpath('oai:metadata/oai_dc:dc/dc:title',
                       namespaces=self.namespaces),
             summary, keep_title=True)
        scan(rec.xpath('oai:metadata/oai_dc:dc/dc:description',
                       namespaces=self.namespaces),
             summary)
        scan(rec.xpath('oai:metadata/oai_dc:dc/dc:subject',
                       namespaces=self.namespaces),
             summary)

        summary['citation_html'] = self.make_citation_html(rec)
        separator = '-----------------------------------'
        print(separator)
        print(summary['citation_html'])
        print('Trinomials: ' + str(summary['trinomials']))
        print(separator)

        if not len(summary['trinomials']) > 0:
            continue
        # Trinomials were found, so remember this publication.
        self.trinomial_refs.append(summary)
        for trinomial in summary['trinomials']:
            if trinomial not in self.unique_trinomials:
                self.unique_trinomials.append(trinomial)
                # Parse the new trinomial to record its state and county
                # code when that code is new as well.
                tri_parts = TrinomialManage().parse_trinomial(trinomial)
                code = tri_parts['state'] + tri_parts['county']
                if code not in self.unique_state_counties:
                    self.unique_state_counties.append(code)