Exemplo n.º 1
0
def first(stmt, search_set):
    """Compute the set of symbols that can begin the sequence ``stmt``.

    Args:
        stmt: Sliceable sequence of grammar symbols (indexed with ``[0]``
            and ``[1:]``).
        search_set: Returned unchanged when ``stmt`` is empty.

    Returns:
        ``search_set`` for an empty ``stmt``; a one-element set holding the
        leading symbol when it is in ``terminal_set``; otherwise, for a
        leading symbol in ``non_terminal_set``, ``first_set(symbol)`` with
        any ``''`` entry replaced by the result for the rest of ``stmt``.
        ``None`` when the leading symbol is in neither set.
    """
    if not len(stmt):
        return search_set

    head = stmt[0]
    if head in terminal_set:
        return {head}

    if head in non_terminal_set:
        head_first = first_set(head)
        if '' not in head_first:
            return head_first
        # '' is presumably the empty-production marker: drop it from a new
        # set (the set returned by first_set is left untouched) and continue
        # with whatever can start the remaining symbols.
        tail_first = first(stmt[1:], search_set)
        return head_first.difference(('',)).union(tail_first)

    # Leading symbol is in neither set: same result as falling off the end.
    return None
Exemplo n.º 2
0
def first_for_stat(left, statement):
    """Compute the set of symbols that can begin ``statement``.

    Args:
        left: Symbol passed to ``follow`` when ``statement`` is empty.
        statement: Sliceable sequence of grammar symbols.

    Returns:
        ``follow(left)`` for an empty ``statement``; a one-element set with
        the leading symbol when it is in ``terminal_set``; for a leading
        symbol in ``non_terminal_set``, ``first(symbol)``, merged with the
        result for the remaining symbols (and with ``''`` removed) when
        ``first(symbol)`` contains ``''``.
    """
    global terminal_set, non_terminal_set

    if len(statement) == 0:
        return follow(left)
    else:
        head = statement[0]
        if head in terminal_set:
            return {head}
        elif head in non_terminal_set:
            head_first = first(head)
            if '' not in head_first:
                return head_first
            # '' is presumably the empty-production marker: merge in what
            # the rest of the statement can start with, then drop the marker
            # from the merged (new) set.
            merged = head_first.union(first_for_stat(left, statement[1:]))
            merged.remove('')
            return merged
        # NOTE(review): `filename` and `path_1` are not defined in the visible
        # code -- presumably supplied by an enclosing loop that is not shown
        # here; confirm against the original file.
        # Split the file name into a (stem, extension) pair.
        portion = os.path.splitext(filename)
        if portion[1] == ".dat":
            # Build the replacement name with a ".txt" extension.
            newname = portion[0] + ".txt"
            # Paths are built by plain string concatenation; assumes path_1
            # already ends with a path separator -- TODO confirm.
            filenamedir = path_1 + filename
            newnamedir = path_1 + newname
            # Rename using the directory-prefixed paths, not the bare names.
            os.rename(filenamedir, newnamedir)


# Extract the text column from the crawled-news CSV and clean it by deleting
# punctuation and decorative symbols.
add_punc = ',。、【】“”:;()《》‘’{}<>!?/=~★◥◣█◢ ▌▅▇█◤●!▃▃▁▂▃?一→¥▲一⑦()、%^>℃:.”“^-——=&#@$❤*+▆.'
all_punc = punctuation + add_punc
# NOTE(review): the filter below removes only the characters in `add_punc`;
# `all_punc` is not used in this section -- confirm which set was intended.

# Deletion table built once: every character of add_punc maps to None, so
# str.translate drops them in a single pass instead of an append/remove
# list scan per character.
strip_table = str.maketrans('', '', add_punc)

# Both files are opened with newline='' as the csv module requires, so that
# newlines embedded in quoted fields are passed through unmodified.
with open('demo/newsVec_train.csv',
          newline='',
          encoding='utf-8') as rf, open(r'demo/newsVec_train_process.csv',
                                        'w',
                                        newline='',
                                        encoding='utf-8') as wf:
    reader = csv.reader(rf)
    writer = csv.writer(wf)
    for row in reader:
        # Only the third column is kept; a row with fewer than three columns
        # raises IndexError here.
        sec = [row[2].translate(strip_table)]
        print(sec)
        writer.writerow(sec)