Python tokenize_text 예제들, mappening.api.utils.tokenizer.tokenize_text Python 예제들

예제 #1

0

파일 보기

def add_locations_from_collection():
    """Merge locations found in the events data into the locations collection.

    Each candidate returned by ``get_locations_from_collection()`` is matched
    against ``locations_collection`` twice: by exact latitude/longitude, and
    by a case-insensitive match of its cleaned-up name against
    ``location.alternative_names``. If either lookup finds a document, the
    merge is delegated to ``location_processor.handle_keys``; otherwise the
    candidate is inserted as a new document.

    Returns:
        The ``jsonify`` response with two keys: ``'Added Locations'`` (the
        newly inserted candidates) and ``'Updated Locations'`` (the truthy
        results of ``handle_keys``).
    """
    updated_locations = []
    added_locations = []

    # Can change what collection we get locations from
    new_locations = get_locations_from_collection()

    for new_loc in new_locations:
        loc_info = new_loc['location']

        # Find a stored location with the same coordinates. Candidates that
        # lack coordinates are looked up with the INVALID_COORDINATE default.
        coord_loc = locations_collection.find_one(
            {
                'location.latitude':
                loc_info.get('latitude', INVALID_COORDINATE),
                'location.longitude':
                loc_info.get('longitude', INVALID_COORDINATE),
            },
            {'_id': False})
        alt_name_loc = None

        # Strip "UCLA-"/"-UCLA" decorations, then tokenize and remove
        # unnecessary/common words.
        place_name = loc_info.get('name')
        if place_name:
            place_name = re.sub(r'(UCLA-|-UCLA)+\s?',
                                '',
                                place_name,
                                flags=re.IGNORECASE)
            place_name = tokenizer.tokenize_text(place_name)

        # Only search by name when something is left after tokenizing: an
        # empty pattern would match every document. The name is escaped so
        # that characters such as "(" or "+" are matched literally instead of
        # being interpreted as (possibly invalid) regex syntax.
        if place_name:
            processed_place = re.compile(re.escape(place_name), re.IGNORECASE)
            alt_name_loc = locations_collection.find_one(
                {'location.alternative_names': processed_place},
                {'_id': False})

        if coord_loc or alt_name_loc:
            # Location already in db but possibly missing info: merge the new
            # info into the stored document. A name match takes precedence
            # over a coordinate match when both exist.
            if coord_loc and not alt_name_loc:
                loc_result = location_processor.handle_keys(
                    coord_loc, new_loc, place_name)
            else:
                loc_result = location_processor.handle_keys(
                    alt_name_loc, new_loc, place_name, True)

            if loc_result:
                updated_locations.append(loc_result)
        else:
            # No pre-existing location, so insert a new one. Also record the
            # stripped version of the name as an alternative name, creating
            # the list if the candidate does not carry one yet.
            if place_name and place_name != loc_info['name'].lower():
                loc_info.setdefault('alternative_names',
                                    []).append(place_name)
            added_locations.append(new_loc)
            # Insert a shallow copy so that any top-level key the driver adds
            # to the inserted dict (presumably `_id`) does not end up in the
            # object that is serialized in the response.
            locations_collection.insert_one(new_loc.copy())

    return jsonify({
        'Added Locations': added_locations,
        'Updated Locations': updated_locations
    })

예제 #2

0

파일 보기

파일: location_utils.py 프로젝트: minj131/Mappening-Backend

def search_locations(place_query):
    """Look up stored locations matching a free-text place name.

    The lookup runs in two stages:

    1. An anchored, case-insensitive regex match of the processed query
       against ``location.alternative_names``. If anything matches, those
       results are returned immediately.
    2. Otherwise a MongoDB ``$text`` search with the tokenized query, ordered
       by text score, keeping only the first result for each location name.

    Args:
        place_query: Supplied string such as "Boelter Hall" for a location.

    Returns:
        A list of the values produced by ``location_helpers.append_location``
        for every match (empty when nothing matched).
    """
    print("Original place query: " + place_query)
    # Remove leading/trailing white space
    place_query = place_query.strip()

    # Search for exact match first.
    # Sometimes regency village weighted more than sunset village due to
    # repetition of village in the text search below.
    processed_query = location_helpers.process_query(place_query)
    print("Processed place query: " + processed_query)
    # NOTE(review): processed_query is inserted into the pattern verbatim;
    # confirm that process_query() escapes or strips regex metacharacters.
    place_regex = re.compile("^" + processed_query + "$", re.IGNORECASE)
    exact_matches = locations_collection.find(
        {'location.alternative_names': place_regex})

    # Iterate the cursor directly instead of calling Cursor.count() first:
    # count() costs an extra round trip and no longer exists in PyMongo 4.
    output = [location_helpers.append_location(place)
              for place in exact_matches]
    if output:
        return output

    print("Doing text search...")

    # Tokenize query
    tokenized_query = tokenizer.tokenize_text(processed_query)
    print("Tokenized place query: " + tokenized_query)

    # Locations db has text search index on alternate_locations field
    # Search for locations that match words in processed place query
    # Default stop words for english language, case insensitive
    # Sort by score (based on number of occurrences of query words in
    # alternate names). Can limit number of results as well.
    text_matches = locations_collection.find(
        {'$text': {'$search': tokenized_query,
                   '$language': 'english',
                   '$caseSensitive': False}},
        {'score': {'$meta': 'textScore'}}
    ).sort([('score', {'$meta': 'textScore'})])  # .limit(3)

    # Keep only the first (highest scoring) hit per location name.
    seen_names = set()
    for place in text_matches:
        name = place['location'].get('name', "NO NAME")
        if name not in seen_names:
            seen_names.add(name)
            output.append(location_helpers.append_location(place, True))

    return output

예제 #3

0

파일 보기

파일: locations.py 프로젝트: minj131/Mappening-Backend

def tokenize_names():
  """Add tokenized variants of every alternative name to each location.

  For each entry in ``data['locations']`` that has
  ``location.alternative_names``, every existing alternative name is passed
  through ``tokenizer.tokenize_text``. A non-empty result that is not already
  present (compared case-insensitively) is appended to the list in place.

  Returns:
    The ``jsonify`` response ``{"locations": [...]}`` listing only the
    locations that gained at least one new alternative name.
  """
  places = []

  # Go through every location in json
  for place in data['locations']:
    location_info = place['location']
    if 'alternative_names' not in location_info:
      continue

    alt_names = location_info['alternative_names']
    # Lower-cased names seen so far, built once instead of per comparison.
    known = {name.lower() for name in alt_names}
    updated = False

    # Iterate over a snapshot: appending to a list while looping over it
    # makes the loop visit the freshly appended items as well.
    for alt_name in list(alt_names):
      processed_name = tokenizer.tokenize_text(alt_name)
      if not processed_name:
        continue
      # Compare lower-cased on both sides so a token containing upper-case
      # characters is not appended again and again.
      lowered = processed_name.lower()
      if lowered not in known:
        alt_names.append(processed_name)
        known.add(lowered)
        updated = True

    if updated:
      places.append(place)

  return jsonify({"locations": places})

예제 #4

0

파일 보기

파일: locations.py 프로젝트: minj131/Mappening-Backend

def _extract_zip(address):
  """Return the ZIP code contained in *address*, or "NO ZIP" if none is found."""
  # Take the LAST 5-digit (or ZIP+4) group: the first one is frequently a
  # 5-digit house number, while the ZIP code comes near the end of an address.
  matches = re.findall(r'\b\d{5}(?:-\d{4})?\b', address)
  return matches[-1] if matches else "NO ZIP"


def _apply_coordinates(loc_info, search_results):
  """Copy the first search result's coordinates into *loc_info*.

  Returns True when real coordinates were stored. Returns False when there
  were no results (nothing is written) or when the first result only carried
  the "NO LATITUDE"/"NO LONGITUDE" placeholders (404 is written instead).
  """
  if not search_results:
    return False
  # Assume first result is best result/most relevant result
  best = search_results[0]
  if best['latitude'] == "NO LATITUDE" or best['longitude'] == "NO LONGITUDE":
    loc_info['latitude'] = 404
    loc_info['longitude'] = 404
    return False
  loc_info['latitude'] = best['latitude']
  loc_info['longitude'] = best['longitude']
  return True


def fill_location_data():
  """Fill in missing names, address info and coordinates for json locations.

  For every entry in ``data['locations']`` (modified in place):

  * the name and its tokenized form are added to ``alternative_names`` when
    that list exists and does not contain them yet (case-insensitive);
  * a missing/empty ``street`` or ``zip`` is looked up with
    ``google_textSearch`` using the location name;
  * a missing ``latitude``/``longitude`` (or the default value 420) is looked
    up using the name, or the street when there is no name.

  Junk values written when nothing can be found: "NO STREET", "NO ZIP",
  "NO NAME", 404 (search result had no coordinates) and 666 (neither a name
  nor a street to search with).

  Returns:
    The ``jsonify`` response ``{"New/Modified Locations": [...]}`` with the
    locations for which real information was added.
  """
  updated_places = []

  # Go through every location in json
  for place in data['locations']:
    loc_info = place['location']
    updated = False

    # Decide this once, BEFORE the "NO NAME" placeholder may be written
    # below; otherwise the placeholder would later be geocoded as if it were
    # a real name.
    has_name = 'name' in loc_info
    # Results of searching by name, fetched at most once per location.
    name_results = None

    # Add the name and its stripped down form to alternative_names
    if has_name:
      name = loc_info['name']
      processed_place = tokenizer.tokenize_text(name)
      if 'alternative_names' in loc_info:
        alt_names = loc_info['alternative_names']
        if name and name.lower() not in (n.lower() for n in alt_names):
          alt_names.append(name)
          updated = True
        if processed_place and processed_place not in (n.lower() for n in alt_names):
          alt_names.append(processed_place)
          updated = True

    # No street or zip information, try to find it
    if loc_info.get('street', '') == '' or loc_info.get('zip', '') == '':
      if has_name:
        # Use location name to try to find location info
        name_results = google_textSearch(loc_info['name'])
        if name_results:
          # Assume first result is best result/most relevant result
          loc_info['street'] = name_results[0]['address']
          loc_info['zip'] = _extract_zip(loc_info['street'])
          updated = True
      else:
        # Without a name there is nothing to search with. Only fill in the
        # fields that are actually missing so an existing street is kept and
        # can still be used for the coordinate lookup below.
        if loc_info.get('street', '') == '':
          loc_info['street'] = "NO STREET"
        if loc_info.get('zip', '') == '':
          loc_info['zip'] = "NO ZIP"
        loc_info['name'] = "NO NAME"

    # Check if latitude/longitude is filled out (420 is default value)
    if loc_info.get('latitude', 420) == 420 or loc_info.get('longitude', 420) == 420:
      if has_name:
        # Reuse the search done for street/zip instead of querying again
        if name_results is None:
          name_results = google_textSearch(loc_info['name'])
        if _apply_coordinates(loc_info, name_results):
          updated = True
      elif loc_info.get('street', '') not in ('', "NO STREET"):
        # If there is no name, use the street to try to find location info
        if _apply_coordinates(loc_info, google_textSearch(loc_info['street'])):
          updated = True
      else:
        # There was no name or street info, set to another junk value
        loc_info['latitude'] = 666
        loc_info['longitude'] = 666

    # Keep track of places with info that was actually updated
    if updated:
      updated_places.append(place)

  return jsonify({"New/Modified Locations": updated_places})