示例#1
0
def intention_graph(path):
    """Build a successor graph of intentions from a JSON-lines file.

    Each non-blank line of ``path`` must be a JSON object whose
    ``"intention_list"`` entry is a sequence of intentions.  Every intention
    that occurs becomes a key of the result; its value lists, without
    duplicates, the intentions that directly follow it in any sequence.  An
    intention that is never followed by another one maps to an empty list.

    The finished graph is printed through ``_uniout.unescape`` before it is
    returned.

    Args:
        path: Path of the JSON-lines file to read.

    Returns:
        dict mapping each intention to the list of its direct successors.
    """
    sequences = []
    with open(path, "r") as file:
        for line in file:
            # Skip blank lines (e.g. a trailing empty line) instead of
            # letting json.loads fail on them.
            if line.strip():
                sequences.append(json.loads(line)["intention_list"])

    graph = dict()
    for sequence in sequences:
        # Register every intention first so that those without a successor
        # still show up as keys.
        for intention in sequence:
            graph.setdefault(intention, [])
        # Walk all adjacent pairs: a repeated intention contributes the
        # successor of each of its occurrences, not only of the first one
        # (which is all that list.index() would find).
        for current, following in zip(sequence, sequence[1:]):
            successors = graph[current]
            if following not in successors:
                successors.append(following)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(_uniout.unescape(str(graph), 'utf8'))
    return graph
示例#2
0
def test_unicode():
    """Copy the first question of every record into ``test_dump2.txt``.

    Reads ``test_data.txt`` as UTF-8 JSON lines, takes the first entry of
    each record's ``"question_list"``, prints it through
    ``_uniout.unescape`` and writes ``{"question": <text>}`` as one JSON line
    to ``test_dump2.txt``, keeping non-ASCII characters unescaped.
    """
    # Local import: io.open gives an encoding-aware open() on Python 2 and 3.
    import io

    with io.open('test_dump2.txt', 'w', encoding='utf-8') as out:
        with io.open('test_data.txt', 'r', encoding='utf-8') as file:
            for line in file:
                text = json.loads(line)
                s = text["question_list"][0]
                print(_uniout.unescape(s, "utf-8"))
                # json.loads already returns decoded text; calling
                # .decode("utf-8") on it fails for non-ASCII questions, so
                # the value is stored as is.
                result = {"question": s}
                payload = json.dumps(result, ensure_ascii=False)
                if isinstance(payload, bytes):
                    # Python 2 may hand back a byte string for all-ASCII
                    # data, while the text-mode file only accepts unicode.
                    payload = payload.decode("utf-8")
                out.write(payload + u"\n")
示例#3
0
if __name__ == '__main__':
    # Single pass over the tab-separated file.  Column 0 is the intention
    # label (split on ',' below, so at least two comma-separated slots are
    # expected), column 1 is the input text.
    with open(data_path, 'r') as f:
        for line in csv.reader(f, delimiter='\t'):
            a = line[0]
            b = line[1]

            # Collect every distinct label in first-seen order.
            if a not in classes:
                classes.append(a)

            if b not in inputs:
                inputs[b] = [a]

            # A label conflicts with an already recorded one when both share
            # the first slot but differ in the second.  The decision is made
            # before the list is touched: appending while iterating over the
            # same list added the label once per conflicting entry.
            a_slots = a.split(',')
            conflicts = any(
                x_slots[0] == a_slots[0] and x_slots[1] != a_slots[1]
                for x_slots in (x.split(',') for x in inputs[b])
            )
            if conflicts:
                inputs[b].append(a)

    # Report only the inputs that ended up with more than one label.
    # items() iterates the same pairs as iteritems() and also exists on
    # Python 3.
    for inp, intentions in inputs.items():
        if len(intentions) >= 2:
            print(inp, _uniout.unescape(str(intentions), 'utf8'))

# print(_uniout.unescape(str(results), 'utf8'))
# print(_uniout.unescape(str(results['用卡取两百块']), 'utf8'))
            # NOTE(review): this statement is the body of a loop whose header
            # lies outside the visible text; presumably it iterates another
            # shouldReplaceList* collection -- confirm against the full file.
            i["noun"] = i["noun"].replace(item.decode("utf-8"), 'Transportation')

        # Rule 9: every entry of shouldReplaceList9 (decoded from UTF-8) that
        # occurs in the noun text is rewritten to the label 'Venue'.
        for item in shouldReplaceList9:
            i["noun"] = i["noun"].replace(item.decode("utf-8"), 'Venue')

        # Append the rewritten noun text to the accumulated corpus.
        corpus += (i["noun"])

### Word counts over the whole corpus
for doc in [corpus]:
    # Tally every whitespace-separated token of the document.
    tf = Counter(doc.split())
    # Emit one numbered line per (word, count) pair, numbering from 1.
    for rank, pair in enumerate(tf.items(), 1):
        print("%d %s" % (rank, _uniout.unescape(str(pair), 'utf8')))

### Group the records in ``data`` by their 'hotel' value
by_hotel = operator.itemgetter('hotel')
# groupby only merges adjacent items, so the records are ordered by the
# grouping key first (in place).
data.sort(key=by_hotel)

### One list of records per distinct 'hotel'
list1 = [list(members) for _, members in itertools.groupby(data, by_hotel)]

### create a list of department number and average age in each department
示例#5
0
def cn(q):
    """Return ``str(q)`` passed through ``_uniout.unescape`` with 'utf8'.

    Presumably used to make escaped non-ASCII text readable again; the exact
    transformation is defined by ``_uniout``.
    """
    rendered = str(q)
    return _uniout.unescape(rendered, 'utf8')
示例#6
0
def print_cn(q, others=''):
    """Print ``q`` (unescaped via ``_uniout``) together with ``others``.

    Args:
        q: Object to show; it is converted with ``str`` and then passed
            through ``_uniout.unescape`` with 'utf8'.
        others: Extra value handed to the same print call (default ``''``).
    """
    readable = _uniout.unescape(str(q), 'utf8')
    print(readable, others)
示例#7
0
 def jieba_cut(self, input_):
     """Segment ``input_`` with jieba and return the tokens as one string.

     The text is cut with ``cut_all=False``, the pieces are joined with
     single spaces, and the result is passed through ``_uniout.unescape``
     with 'utf8'.
     """
     pieces = jieba.cut(input_, cut_all=False)
     joined = " ".join(pieces)
     return _uniout.unescape(str(joined), 'utf8')
示例#8
0
def print_cn(*q):
    """Print all positional arguments as one comma-separated line.

    The arguments are joined with ',' (so they must be strings) and the
    joined text is passed through ``_uniout.unescape`` with 'utf8' before
    printing.
    """
    joined = ','.join(q)
    print(_uniout.unescape(joined, 'utf8'))
示例#9
0
 def cut(self, input_):
     """Strip punctuation from ``input_`` and return its jieba tokens.

     Punctuation is removed with
     ``QueryUtils.static_simple_remove_punct``; the cleaned text is cut with
     ``cut_all=False``, joined with single spaces and passed through
     ``_uniout.unescape`` with 'utf8'.
     """
     cleaned = QueryUtils.static_simple_remove_punct(input_)
     pieces = jieba.cut(cleaned, cut_all=False)
     joined = " ".join(pieces)
     return _uniout.unescape(str(joined), 'utf8')
示例#10
0
 def cut(self, input_):
     """Strip Chinese punctuation from ``input_`` and return its jieba tokens.

     Punctuation is removed with ``QueryUtils.static_remove_cn_punct``; the
     cleaned text is cut with ``cut_all=True``, joined with single spaces
     and passed through ``_uniout.unescape`` with 'utf8'.
     """
     cleaned = QueryUtils.static_remove_cn_punct(input_)
     joined = " ".join(jieba.cut(cleaned, cut_all=True))
     return _uniout.unescape(str(joined), 'utf8')