def test_normalize():
    """Check that normalize() scales hub/authority scores to unit L2 norm.

    Only valid for the sample data set up elsewhere in the test module:
    six pages whose raw hub scores are 1..6 (authorities reversed), so the
    normalizer is sqrt(1^2 + ... + 6^2) = sqrt(91).
    """
    normalize(pageDict)
    # FIX: the original printed the generator object itself
    # (`print(page.hub for ...)` emits "<generator object ...>");
    # materialize it so the hub scores are actually visible.
    print([page.hub for addr, page in nlp.pagesIndex.items()])
    # Works only for sample data above: hubs 1..6 divided by sqrt(91).
    expected_hub = [i / 91 ** 0.5 for i in range(1, 7)]
    expected_auth = list(reversed(expected_hub))
    assert len(expected_hub) == len(expected_auth) == len(nlp.pagesIndex)
    # sorted() fixes the address ordering so the comparison is deterministic
    assert expected_hub == [page.hub for addr, page in sorted(nlp.pagesIndex.items())]
    assert expected_auth == [page.authority for addr, page in sorted(nlp.pagesIndex.items())]
def __init__(self, filename):
    """Parse a lyric file into song metadata attributes.

    The file is expected to contain 'key:\n<value>\n' sections for
    title, singer, writer, composer, year and sex, plus a multi-line
    'lyric:' section.  Raises AttributeError if a section is missing
    (re.search returns None).

    :param filename: path to the UTF-8 encoded lyric file
    """
    # FIX: `with` guarantees the handle is closed even if read()/decode()
    # raises; the original open/close pair leaked the handle on error.
    # (.decode('utf-8') => this module targets Python 2.)
    with open(filename) as f:
        text = nlp.normalize(f.read().decode('utf-8'))
    self.title = re.search(r'title:\n(.+)\n', text).group(1)
    self.singer = re.search(r'singer:\n(.+)\n', text).group(1)
    self.writer = re.search(r'writer:\n(.+)\n', text).group(1)
    self.composer = re.search(r'composer:\n(.+)\n', text).group(1)
    self.year = re.search(r'year:\n(.+)\n', text).group(1)
    self.sex = re.search(r'sex:\n(.+)\n', text).group(1)
    # NOTE(review): unlike the fields above, this keeps .group() -- the
    # whole match INCLUDING the 'lyric:\n' header.  Preserved as-is, but
    # group(1) looks like it may have been intended; confirm with callers.
    self.lyric = re.search(r'lyric:\n((?:.+\n)+)', text).group()
def readFile(url):
    """Load an intents JSON file and populate the module-level stores.

    For every entry: the entry itself goes into DATUM, its tag into TAGS
    (paired with a 0 score), and each normalized pattern feeds both
    VOCABULARY (tokenized) and TAG_PATTERN (tag/pattern pairs).
    """
    with open(url, encoding='utf-8') as handle:
        entries = json.load(handle)
    for entry in entries:
        DATUM.append(entry)
        tag = entry['tags']
        TAGS.append((tag, 0))
        for raw_pattern in entry['patterns']:
            cleaned = normalize(raw_pattern)
            VOCABULARY.extend(tokenization(cleaned))
            TAG_PATTERN.append((tag, cleaned))
def loadFile(filename):
    """Read a UTF-8 file, normalize its text, and parse each section.

    :param filename: path to the input file
    :return: list of parsed documents, one per section produced by split()
    """
    # FIX: `with` closes the handle even if read()/decode() raises;
    # the original open/close pair leaked it on error.
    # (.decode('utf-8') => this module targets Python 2.)
    with open(filename) as f:
        text = nlp.normalize(f.read().decode('utf-8'))
    # one parsed doc per section (comprehension replaces the append loop)
    return [parse(section) for section in split(text)]
def add_to_index(headers):
    """Merge `headers` into the persistent inverted index.

    Each header looks like '<address> ^ <title text>' (fields joined by
    ' ^ ').  Every word of the normalized title maps to the address, and
    the result is written back as lines of the form
    'word ^ addr1 addr2 ...\n'.
    """
    # FIX: the original only used a defaultdict when the stored index was
    # empty; with a non-empty plain dict, index[word].append(...) raised
    # KeyError for any word not already present.  Wrapping unconditionally
    # fixes that and keeps existing entries.
    index = defaultdict(list, get_index())
    for header in headers:
        parts = header.split(' ^ ')
        # nlp.normalize is assumed to yield individual words -- TODO confirm
        for word in nlp.normalize(parts[1]):
            index[word].append(parts[0])
    # (The original's `type(x) is not list` guard was always true --
    # keys are words -- so it is dropped here.)
    index_lst = [word + ' ^ ' + ' '.join(addrs) + '\n'
                 for word, addrs in index.items()]
    db.write_to_base(db.index_db, index_lst)
def prepareSlotValuesIndependent():
    """Build the (surface value, delexicalization placeholder) dictionary.

    Reads each domain database from db/<domain>_db.json and pairs every
    slot value with a placeholder like '[restaurant_address]'.  Common
    spelling variants (road/rd, st/street, 'b & b'/'bed and breakfast')
    are added alongside the canonical value.  General area/food/pricerange
    values are appended last so the more specific entries match first.

    Returns the list of (value, placeholder) tuples.
    """
    domains = ['restaurant', 'hotel', 'attraction', 'train', 'taxi', 'hospital', 'police']
    # NOTE(review): `requestables` is defined but unused in this function.
    requestables = ['phone', 'address', 'postcode', 'reference', 'id']
    dic = []
    # area/food/pricerange are collected separately and appended at the
    # very end (see "more general values" below).
    dic_area = []
    dic_food = []
    dic_price = []
    # read databases
    for domain in domains:
        try:
            # `file(...)` is the Python 2 built-in alias for open().
            fin = file('db/' + domain + '_db.json')
            db_json = json.load(fin)
            fin.close()
            for ent in db_json:
                for key, val in ent.items():
                    if val == '?' or val == 'free':
                        # placeholder / degenerate values carry no entity
                        pass
                    elif key == 'address':
                        dic.append((normalize(val), '[' + domain + '_' + 'address' + ']'))
                        # also register one common abbreviation variant;
                        # the elif ordering matters ("st" is a substring
                        # of "street"), so only one variant is added.
                        if "road" in val:
                            val = val.replace("road", "rd")
                            dic.append((normalize(val), '[' + domain + '_' + 'address' + ']'))
                        elif "rd" in val:
                            val = val.replace("rd", "road")
                            dic.append((normalize(val), '[' + domain + '_' + 'address' + ']'))
                        elif "st" in val:
                            val = val.replace("st", "street")
                            dic.append((normalize(val), '[' + domain + '_' + 'address' + ']'))
                        elif "street" in val:
                            val = val.replace("street", "st")
                            dic.append((normalize(val), '[' + domain + '_' + 'address' + ']'))
                    elif key == 'name':
                        dic.append((normalize(val), '[' + domain + '_' + 'name' + ']'))
                        # common naming variants ('gonville hotel' keeps
                        # its 'hotel' suffix on purpose)
                        if "b & b" in val:
                            val = val.replace("b & b", "bed and breakfast")
                            dic.append((normalize(val), '[' + domain + '_' + 'name' + ']'))
                        elif "bed and breakfast" in val:
                            val = val.replace("bed and breakfast", "b & b")
                            dic.append((normalize(val), '[' + domain + '_' + 'name' + ']'))
                        elif "hotel" in val and 'gonville' not in val:
                            val = val.replace("hotel", "")
                            dic.append((normalize(val), '[' + domain + '_' + 'name' + ']'))
                        elif "restaurant" in val:
                            val = val.replace("restaurant", "")
                            dic.append((normalize(val), '[' + domain + '_' + 'name' + ']'))
                    elif key == 'postcode':
                        dic.append((normalize(val), '[' + domain + '_' + 'postcode' + ']'))
                    elif key == 'phone':
                        # phone numbers are kept verbatim (not normalized)
                        dic.append((val, '[' + domain + '_' + 'phone' + ']'))
                    elif key == 'trainID':
                        dic.append((normalize(val), '[' + domain + '_' + 'id' + ']'))
                    elif key == 'department':
                        dic.append((normalize(val), '[' + domain + '_' + 'department' + ']'))
                    # NORMAL DELEX
                    elif key == 'area':
                        dic_area.append((normalize(val), '[' + 'value' + '_' + 'area' + ']'))
                    elif key == 'food':
                        dic_food.append((normalize(val), '[' + 'value' + '_' + 'food' + ']'))
                    elif key == 'pricerange':
                        dic_price.append((normalize(val), '[' + 'value' + '_' + 'pricerange' + ']'))
                    else:
                        pass
                    # TODO car type?
        except:
            # NOTE(review): bare except deliberately(?) skips domains whose
            # db file is missing or malformed -- silently.
            pass

        # hospital and police have no db file; hard-code their details
        if domain == 'hospital':
            dic.append((normalize('Hills Rd'), '[' + domain + '_' + 'address' + ']'))
            dic.append((normalize('Hills Road'), '[' + domain + '_' + 'address' + ']'))
            dic.append((normalize('CB20QQ'), '[' + domain + '_' + 'postcode' + ']'))
            dic.append(('01223245151', '[' + domain + '_' + 'phone' + ']'))
            dic.append(('1223245151', '[' + domain + '_' + 'phone' + ']'))
            dic.append(('0122324515', '[' + domain + '_' + 'phone' + ']'))
            dic.append((normalize('Addenbrookes Hospital'), '[' + domain + '_' + 'name' + ']'))
        elif domain == 'police':
            dic.append((normalize('Parkside'), '[' + domain + '_' + 'address' + ']'))
            dic.append((normalize('CB11JG'), '[' + domain + '_' + 'postcode' + ']'))
            dic.append(('01223358966', '[' + domain + '_' + 'phone' + ']'))
            dic.append(('1223358966', '[' + domain + '_' + 'phone' + ']'))
            dic.append((normalize('Parkside Police Station'), '[' + domain + '_' + 'name' + ']'))

    # add at the end places from trains
    fin = file('db/' + 'train' + '_db.json')
    db_json = json.load(fin)
    fin.close()
    for ent in db_json:
        for key, val in ent.items():
            if key == 'departure' or key == 'destination':
                dic.append((normalize(val), '[' + 'value' + '_' + 'place' + ']'))

    # add specific values:
    for key in ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']:
        dic.append((normalize(key), '[' + 'value' + '_' + 'day' + ']'))

    # more general values add at the end
    dic.extend(dic_area)
    dic.extend(dic_food)
    dic.extend(dic_price)

    return dic
if __name__ == '__main__':
    # CLI entry point: parse each lyric file given on the command line
    # into a SongInfo and extract its noun terms.
    import argparse
    parser = argparse.ArgumentParser(description='lyric analyzer')
    parser.add_argument('infile', nargs='*')  # zero or more lyric files
    args = parser.parse_args()
    filenames = args.infile
    songs = []
    for filename in filenames:
        # song = load_lyric_file(filename)
        song = SongInfo(filename)
        songs.append(song)
    docs = []  # NOTE(review): populated nowhere in the visible code
    for s in songs:
        # normalize -> tokenize -> keep nouns -> drop stopwords,
        # then store each term's base form on the song object
        text = nlp.normalize(s.lyric)
        terms = nlp.tokenizer(text)
        terms = nlp.extract_noun(terms)
        terms = nlp.remove_stopword(terms)
        s.terms = [t.basic_form for t in terms]
    # dist = distribution([s.getDate(month=True) for s in songs])
    # for k, v in sorted(dist.items()):
    #     print k, v
    # dist = distribution([s.sex for s in songs])
    # for k, v in sorted(dist.items()):
    #     print k, v
    # by year
    year_labels = [s.getDate() for s in songs]
    # NOTE(review): the script appears truncated here -- year_docs is
    # created but never filled or used in the visible source.
    year_docs = {}
def queryResultVenues(domain, turn, real_belief=False):
    """Query the `domain` venue database with the turn's constraints.

    Builds a SQL WHERE clause from the slot/value pairs of the turn and
    runs it against dbs[domain].

    :param domain: database name (e.g. 'restaurant', 'attraction')
    :param turn: dialogue turn; its shape depends on `real_belief`:
        True       -> dict of slot/value pairs (turn.items())
        'tracking' -> turn[domain] is a list of 'domain-slot-value' slots
        False      -> full turn dict; uses turn['metadata'][domain]['semi']
    :return: list of matching DB rows, or [] when the query fails
    """
    # query the db
    sql_query = "select * from {}".format(domain)

    if real_belief == True:
        items = turn.items()
    # FIX: was `elif real_belief='tracking':` -- assignment inside the
    # condition, which is a SyntaxError in Python.
    elif real_belief == 'tracking':
        # FIX: `flag` was read below but never initialized in this branch
        # (NameError on the first constraint).
        flag = True
        for slot in turn[domain]:
            # slot[0] looks like 'domain-slot-value'
            key = slot[0].split("-")[1]
            val = slot[0].split("-")[2]
            # map tracker slot names onto DB column names
            if key == "price range":
                key = "pricerange"
            elif key == "leave at":
                key = "leaveAt"
            elif key == "arrive by":
                key = "arriveBy"
            if val == "do n't care":
                pass  # unconstrained slot
            else:
                if flag:
                    # first constraint opens the WHERE clause
                    sql_query += " where "
                    val2 = val.replace("'", "''")  # escape for SQL string
                    val2 = normalize(val2)
                    # leaveAt/arriveBy are range comparisons, not equality
                    if key == 'leaveAt':
                        sql_query += key + " > " + r"'" + val2 + r"'"
                    elif key == 'arriveBy':
                        sql_query += key + " < " + r"'" + val2 + r"'"
                    else:
                        sql_query += r" " + key + "=" + r"'" + val2 + r"'"
                    flag = False
                else:
                    val2 = val.replace("'", "''")
                    val2 = normalize(val2)
                    if key == 'leaveAt':
                        sql_query += r" and " + key + " > " + r"'" + val2 + r"'"
                    elif key == 'arriveBy':
                        sql_query += r" and " + key + " < " + r"'" + val2 + r"'"
                    else:
                        sql_query += r" and " + key + "=" + r"'" + val2 + r"'"
        try:
            # e.g. "select * from attraction where name = 'queens college'"
            return dbs[domain].execute(sql_query).fetchall()
        except Exception:  # best-effort: a malformed query yields no venues
            return []  # TODO test it
    else:
        items = turn['metadata'][domain]['semi'].items()

    flag = True  # True until the first constraint opens the WHERE clause
    for key, val in items:
        # the belief state encodes "no constraint" in several spellings
        if val == "" or val == "dontcare" or val == 'not mentioned' or val == "don't care" or val == "dont care" or val == "do n't care":
            pass
        else:
            if flag:
                sql_query += " where "
                val2 = val.replace("'", "''")  # escape for SQL string
                val2 = normalize(val2)
                if key == 'leaveAt':
                    sql_query += r" " + key + " > " + r"'" + val2 + r"'"
                elif key == 'arriveBy':
                    sql_query += r" " + key + " < " + r"'" + val2 + r"'"
                else:
                    sql_query += r" " + key + "=" + r"'" + val2 + r"'"
                flag = False
            else:
                val2 = val.replace("'", "''")
                val2 = normalize(val2)
                if key == 'leaveAt':
                    sql_query += r" and " + key + " > " + r"'" + val2 + r"'"
                elif key == 'arriveBy':
                    sql_query += r" and " + key + " < " + r"'" + val2 + r"'"
                else:
                    sql_query += r" and " + key + "=" + r"'" + val2 + r"'"
    try:
        # e.g. "select * from attraction where name = 'queens college'"
        return dbs[domain].execute(sql_query).fetchall()
    except Exception:  # best-effort: a malformed query yields no venues
        return []  # TODO test it
if (inputBag[index] == patternBag[index] or inputBag[index] and patternBag[index]): hit += patternBag[index] index = TAGS.index((dup_tag_pattern[dup_tag_pattern.index(pattern)][0], 0)) if pattern[0] == dup_tags[index][0]: tmp = list(dup_tags[index]) tmp[1] += hit/patternLength dup_tags[index] = tuple(tmp) #find BIGGEST rate max = 0.5 index = -1 for i in range(len(dup_tags)): if dup_tags[i][1] > max: max = dup_tags[i][1] index = i #response if index == -1: print("AI : Xin lỗi tôi không hiểu") else: responseLength = len(DATUM[index]['response']) print("AI: ",DATUM[index]['response'][random.randrange(0, responseLength)]) if index == 1: return 0 else: return 1 readFile(URL) print('Chào mừng đến với ChatBot!\n') while True: userInput = input("You: ") userInput = normalize(userInput) inputLength = len(userInput.split(" ")) inputBag = getBag(userInput, VOCABULARY) if not training(inputBag): break print('ChatBot kết thúc...')
def load_tweets_from_csv(filename):
    """Load and normalize tweet texts from a CSV file.

    Reads column 5 of every row, normalizes it, and drops any tweet that
    contains an '@' (mentions/replies).

    :param filename: path to the CSV file
    :return: list of normalized tweet strings
    """
    # FIX: the original passed open(filename) straight into csv.reader,
    # so the file handle was never closed; `with` releases it.
    # (.decode('utf-8') => this module targets Python 2.)
    with open(filename) as f:
        reader = csv.reader(f)
        tweets = [nlp.normalize(row[5].decode('utf-8')) for row in reader]
    return [t for t in tweets if u'@' not in t]