def get_disambi_title(self, infile):
    disambi_title = {}
    for line in LoadFile.readline(infile):
        words = line.strip().split("\",\"")
        title_tmp = Clean.clean_word(words[1], clean_level="title")
        disambi_tmp = Clean.clean_word(words[0], clean_level="disambi")
        # title_tmp = title_tmp.strip().strip("\"")
        disambi_title[disambi_tmp] = title_tmp
    return disambi_title
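# Nearly every snippet below calls Clean.clean_word(word, clean_level=...).
# A minimal sketch of what such a helper might look like, inferred only from
# the calls above (quote/whitespace stripping plus level-specific rules); the
# real implementation in clean.py may differ.
class Clean:
    @staticmethod
    def clean_word(word, clean_level="title"):
        # Common to all levels: drop surrounding whitespace and quote characters.
        word = word.strip().strip("\"")
        if clean_level == "disambi":
            # Hypothetical extra rule for disambiguation titles.
            word = word.replace("\n", "")
        return word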
def tokenize(articles):
    results = []
    tokenizer = MeCabTokenizer(
        user_dic_path='/usr/local/lib/mecab/dic/mecab-ipadic-neologd')
    for article in articles:
        clean = Clean(article.html)
        cleaned = clean.clean_html_and_js_tags().clean_text().clean_code()
        tokens = tokenizer.extract_noun_baseform(cleaned.text)
        results.append(tokens)
    return list(chain.from_iterable(results))
def main():
    with open("./410_baidu/410_disambi_infobox.csv", 'r', encoding='UTF-8') as inf:
        lines = inf.readlines()
    f = open("./410_baidu/410_disambi_infobox_out.csv", "w", encoding='utf-8')
    list_attr = []
    title_list = get_word_list("./410_baidu/410_title.csv")
    err_count = 0
    counts = {}
    for line in tqdm(lines):
        words = line.strip().split(",")
        disambi = Clean.clean_word(words[0], clean_level='disambi')
        infobox = ",".join(words[1:])
        try:
            info_dict = json.loads(json.loads(infobox))
            # Iterate over a copy of the keys: the dict is mutated (pop/insert) below.
            for attr in list(info_dict.keys()):
                clean_attr = Clean.clean_word(attr)
                info_dict[clean_attr] = info_dict.pop(attr)
                value = info_dict[clean_attr]
                counts[clean_attr] = counts.setdefault(clean_attr, 0) + 1
                list_attr.append(clean_attr)
                value_split = re.split(u"[,。、,/]", value.strip())
                for v in value_split:
                    v = Clean.clean_word(v).strip(u"等").strip(u"收起")
                    title_list.append(v)
                    f.write("\"" + disambi + "\",\"" + clean_attr + "\",\"" + v + "\"" + "\r\n")
        except Exception as e:
            print(e)
            err_count += 1
    f.close()
    title_list = [t.strip(u"\\") for t in title_list]
    title_list = list(set(title_list))
    list_attr = list(set(list_attr))
    sort_counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    with open("./sort_counts.txt", "w", encoding='utf-8') as ouf:
        for i in sort_counts:
            ouf.write(str(i) + "\n")
    with open("./all_attr.txt", "w", encoding='utf-8') as ouf:
        for word_counts in sort_counts:
            if word_counts[1] >= 10:
                ouf.write(str(word_counts[0]) + "\n")
    with open("./410_baidu/410_title_new.csv", "w", encoding='utf-8') as ouf:
        for i in title_list:
            ouf.write("\"" + i + "\"\r\n")
    with open("./410_baidu/all_attr.txt", "w", encoding='utf-8') as ouf:
        for i in list_attr:
            ouf.write(i + "\n")
    print("err_count: ", err_count)
def download(videoId):
    try:
        option = Options.AUDIO if request.args.get(
            "option", "") == "audio" else Options.BOTH
        filename = YoutubeDownloader.download(videoId, option)
        if not filename:
            raise YoutubeDownloader.VideoNotFoundException()
        Clean.scheduleRemove(filename)
        return send_file(os.path.join(".", filename))
    except YoutubeDownloader.VideoNotFoundException:
        abort(404)
    except Exception as e:
        app.logger.error(e)
        abort(500)
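# The handler above uses Flask primitives (request, send_file, abort, app.logger),
# so it is presumably registered as a route on a Flask app. A hypothetical wiring;
# the URL path and app object are assumptions, not shown in the snippet.
from flask import Flask

app = Flask(__name__)
app.add_url_rule("/download/<videoId>", view_func=download)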
def clean_disambi_redirect(infile="source", outfile="target"):
    with open(infile) as inf:
        reader = csv.reader(inf)
        err_counts = 0
        with open(outfile, "w") as ouf:
            for line in tqdm(reader):
                if len(line) != 2:
                    err_counts += 1
                    continue
                print(line)
                disambi = Clean.clean_word(line[0], clean_level='disambi')
                redirect = Clean.clean_word(line[1], clean_level='redirect')
                ouf.write("\"" + disambi + "\",\"" + redirect + "\"\n")
    print("err_counts for disambi_redirect:%d" % (err_counts))
def clean_title_disambi(infile="title_disambi.csv", outfile="title_disambi_out.csv"):
    with open(infile, "r", encoding='utf-8') as inf:
        lines = inf.readlines()
    err_counts = 0
    with open(outfile, "w", encoding='utf-8') as ouf:
        for line in tqdm(lines):
            words = line.strip().split("\",\"")
            if len(words) != 2:
                err_counts += 1
                continue
            title = Clean.clean_word(words[0], clean_level='title')
            disambi = Clean.clean_word(words[1], clean_level='disambi')
            ouf.write("\"" + title + "\",\"" + disambi + "\"\r\n")
    print("err_counts for disambi_redirect: ", err_counts)
def run(self):
    # Local counters, to reduce contention on the shared lock
    times = 0
    etimes = 0
    print("Starting ", self.threadName, "...")
    while True:
        try:
            value = Writer.queue.get_nowait()
        except queue.Empty:
            etimes += 1
            if etimes % 50 == 0 and times != 0:
                Writer.writeLock.acquire()
                Writer.writeSum += times
                Writer.writeLock.release()
                times = 0
                etimes = 0
            time.sleep(0.5)
            continue
        else:
            data = Clean.wash(value)
            self.putData(data)
            times += 1
            # Flush the local counter to the shared total under the lock
            if times % 50 == 0:
                Writer.writeLock.acquire()
                Writer.writeSum += times
                Writer.writeLock.release()
                times = 0
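# A sketch of how the Writer class referenced above might be declared. The
# class-level queue, lock, and counter names come from the method body; the
# base class and the putData body are assumptions.
import queue
import threading

class Writer(threading.Thread):
    queue = queue.Queue()         # shared work queue polled by every writer thread
    writeLock = threading.Lock()  # guards the shared counter below
    writeSum = 0                  # total records written across all threads

    def __init__(self, threadName):
        super().__init__()
        self.threadName = threadName

    def putData(self, data):
        # Assumed persistence step, e.g. an insert into a database or a file write.
        pass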
def get_word_list(filename):
    with open(filename, "r") as inf:
        lines = inf.readlines()
    # print "type line: ", type(lines[0].encode("utf-8"))
    lines = [
        Clean.clean_word(line.decode('utf-8'), clean_level='title')
        for line in lines
    ]
    return lines
def get_title(infile):
    all_title = set([])
    for line in LoadFile.readline(infile):
        title_tmp = Clean.clean_word(line.strip(), clean_level="title")
        title_tmp = title_tmp.strip().strip("\"")
        if title_tmp == "":
            continue
        all_title.add(title_tmp)
    return all_title
def clean_disambi_subject(infile="disambi_subject.csv", outfile="disambi_subject_out.csv"):
    with open(infile) as inf:
        lines = inf.readlines()
    err_counts = 0
    with open(outfile, "w") as ouf:
        for line in tqdm(lines):
            words = line.strip().split("\",\"")
            if len(words) != 2:
                err_counts += 1
                continue
            disambi = Clean.clean_word(
                words[0].decode('utf-8'), clean_level='disambi').encode('utf-8')
            subject = Clean.clean_word(
                words[1].decode('utf-8'), clean_level='subject').encode('utf-8')
            ouf.write("\"" + disambi + "\",\"" + subject + "\"\r\n")
    print "err_counts for disambi_redirect: ", err_counts
def add_schedule(self):
    self.__clear__()
    # Create a Schedule instance and check whether it is time to clean
    set_clean = Schedule()
    if set_clean.check():
        print "(+) Time to Clean "
        new_clean = Clean()
    else:
        print "(-) It is not yet time to clean "
    set_clean.save()
    set_clean.show()
    new = Menu()
def clean_disambi_literal(infile="source", outfile="target"):
    with open(infile) as inf:
        reader = csv.reader(inf)
        err_counts = 0
        with open(outfile, "w") as ouf:
            for line in tqdm(reader):
                if len(line) != 2:
                    err_counts += 1
                    continue
                disambi = Clean.clean_word(line[0], clean_level='disambi')
                literal = Clean.clean_word(line[1], clean_level='literal')
                if literal != '' and disambi != '':
                    if '[朱槿品种]' in disambi:
                        literal = '快乐'
                        disambi = '快乐[[朱槿品种]]'
                    if '"' in literal:
                        literal = literal.replace('"', '""')
                    if '\\' in literal:
                        literal = literal.replace('\\', '')
                    if '"' in disambi:
                        disambi = disambi.replace('"', '""')
                    ouf.write("\"" + disambi + "\",\"" + literal + "\"\n")
    print("err_counts for disambi_redirect:%d" % (err_counts))
class HelloRPC(object):
    def __init__(self):
        # self.classifier = HackAriba()
        self.clean = Clean()

    def get_sentiment_of_list_of_tweets(self, list_of_tweets):
        list_of_text = []
        for key, value in list_of_tweets.iteritems():
            list_of_text.append(value)
        print list_of_text
        clean_text = self.clean.clean_data_to_feed_classifier(list_of_text)
        print "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
        print clean_text
        print "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
        return clean_text
def __main__():
    nj_municipals = json.load(open('./json/nj_municipals.json'))
    counties = list(nj_municipals.keys())

    if len(sys.argv) == 1:
        url, date = Link()
    elif len(sys.argv) == 2:
        _, date = Link()
        url = sys.argv[1]
    else:
        url = sys.argv[1]
        date = sys.argv[2]

    print(url)
    print(date)

    data = Parse(url, counties)
    total_df = Clean(csv_file, data, date, nj_municipals)
    Update(total_df, csv_file)
    Today(total_df, date, counties, json_file)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os

try:
    import datetime
    from Login import Login
    from readconfig import ReadConfig
    from clean import Clean
    lj = Login()
    dc = Clean()
    _info = ReadConfig()
except Exception as e:
    print('Missing configuration file: %s' % e)
    print('Press Enter to exit')
    _k = input()
    os._exit(0)


def main():
    try:
        print('This program cleans company-name data -> edit the config file and it is ready to run')
        print('Version: 1.3')
        print('Press Enter to start')
        k1 = input()
        # Read the raw data from the database
        print('Cleaning data -> please wait..')
        info1 = _info.get_input("col_name")    # column name
        info2 = _info.get_input("table_name")  # table name -> include the schema prefix if one is required
        sql1 = ''' select {}
def get_word_list(in_f):
    # with open(filename, "r") as in_f:
    reader = csv.reader(in_f)
    lines = [Clean.clean_word(line[0], clean_level='literal') for line in reader]
    # Guard the removal: list.remove raises ValueError if no empty string is present
    if '' in lines:
        lines.remove('')
    return lines
def main():
    with open("source/disambi.csv") as in_f_disambi, open("source/infobox.csv", "r") as in_f_infobox,\
            open('source/literal.csv') as in_f_literal, open("target/disambi_infobox.csv", "w") as out_f:
        literal_list = get_word_list(in_f_literal)
        disambi_reader = csv.reader(in_f_disambi)
        info_lines = in_f_infobox.readlines()
        list_attr = []
        list_value = []
        err_count = 0
        attr_counts = {}
        for (disambi, infobox) in tqdm(zip(disambi_reader, info_lines)):
            disambi = Clean.clean_word(disambi[0], clean_level='disambi')
            if '"' in disambi:
                disambi = disambi.replace('"', '""')
            if infobox != '{}':
                try:
                    # print(json.loads(infobox))
                    info_dict = json.loads(json.loads(infobox).replace("\\", r"\\"))
                    clean_info_dict = {}
                    for attr in info_dict.keys():
                        clean_attr = Clean.clean_word(attr, clean_level='others')
                        if clean_attr not in clean_info_dict.keys():
                            clean_info_dict[clean_attr] = info_dict[attr]
                    for clean_attr in clean_info_dict.keys():
                        value = str(','.join(clean_info_dict[clean_attr])) if clean_info_dict[clean_attr] != [] else None
                        if value:
                            # value = value.replace('\"','').replace("\\",'').replace('"','""')
                            value = value.replace('"', '""')
                            attr_counts[clean_attr] = attr_counts.setdefault(clean_attr, 0) + 1  # Collect attr. frequency
                            list_attr.append(clean_attr)
                            # literal_list.append(value)
                            list_value.append(value)
                            out_f.write("\"" + disambi + "\",\"" + clean_attr + "\",\"" + value + "\"" + "\n")
                            # value_split = re.split(u"[,。、,/]", value.strip())
                            # for v in value_split:
                            #     v = Clean.clean_word(v).strip(u"等").strip(u"收起")
                            #     v = v.strip(u"等").strip(u"收起")
                            #     if len(v) > 0:
                            #         literal_list.append(v)
                            #         list_value.append(v)
                            #         out_f.write("\"" + disambi + "\",\"" + clean_attr + "\",\"" + v + "\"" + "\r\n")
                except Exception as e:
                    print(f'Error:{e},Disambi:{disambi},Infobox:{infobox}')
                    err_count += 1
                    # break
        literal_list = [t.replace('\"', '').replace("\\", '').replace('"', '""') for t in literal_list]
        literal_list = list(set(literal_list))
        list_attr = list(set(list_attr))
        list_value = list(set(list_value))
        sort_counts = sorted(attr_counts.items(), key=lambda x: x[1], reverse=True)
        with open("target/sorted_all_attr.txt", "w") as ouf:
            for i in sort_counts:
                ouf.write(str(i) + "\n")
        with open("target/sorted_filerted_attr.txt", "w") as ouf:
            for word_counts in sort_counts:
                if word_counts[1] >= 10:
                    ouf.write(str(word_counts[0]) + "\n")
        with open("target/literal.csv", "w") as ouf:
            for i in literal_list:
                ouf.write("\"" + i + "\"\n")
        with open("target/attr.txt", "w") as ouf:
            for i in list_attr:
                ouf.write(i + "\n")
        with open("target/value.csv", "w") as ouf:
            for i in list_value:
                ouf.write("\"" + i + "\"\n")
        print("err_count: ", err_count)
'''
Clean the disambi names; leave everything else unchanged.
'''
import re
from clean import Clean
from tqdm import tqdm
import csv

with open("source/disambi_attrs.csv") as inf:
    title_dict = {}
    err_count = 0
    reader = csv.reader(inf)
    for line in tqdm(reader):
        curLink = line[-2]
        exterLink = line[-1]
        clean_disambi = Clean.clean_word(line[0], 'disambi')
        if '"' in clean_disambi:
            clean_disambi = clean_disambi.replace('"', '""')
        if curLink == 'http://www.baike.com/wiki/%22':
            clean_disambi = '""[标点符号]'
        if len(line) < 5:
            print(f'\n{line},{len(line)}')
            err_count += 1
            literal = '""'
            abstract = Clean.clean_word(line[1], 'others').strip()
        else:
            literal = Clean.clean_word(line[0], 'title')
            abstract = line[2] if len(line) == 5 else ''.join(line[2:-2])
        abstract = abstract.replace('编辑摘要 ', '').replace('"', "'").strip()
        title_dict[clean_disambi] = [literal, abstract, curLink, exterLink]

print("Error count:%d" % (err_count))
#!/usr/bin/env python
# coding=utf-8
import re
from clean import Clean
from tqdm import tqdm

with open("./410_baidu/410_disambi.csv", "r", encoding='utf-8') as inf:
    title_dict = {}
    count = 0
    lines = inf.readlines()
    for line in tqdm(lines):
        words = line.strip().split("\",\"")
        if len(words) != 4:
            count += 1
        clean_disambi = Clean.clean_word(words[0], 'disambi')
        title_dict[clean_disambi] = words[1:]
    print("Error lines: ", count)

with open("./410_baidu/410_disambi_new.csv", "w", encoding='utf-8') as ouf:
    for i in title_dict.keys():
        ouf.write("\"" + i + "\",\"" + "\",\"".join(title_dict[i]) + "\r\n")
pricing = os.path.join(BASE_DIR, 'data/pricing_data.csv')
df = pd.read_csv(pricing, encoding='unicode_escape')

# Drop rows with all values missing
df.dropna(how='all', inplace=True)

# Return the strings without non-ASCII characters
clean = df.applymap(cd.remove_non_ascii, na_action='ignore')

# Find pounds, or pounds and pence, and return the amount as a float
clean['price'] = clean['price'].map(cd.pounds_and_pence, na_action='ignore')

clean.boxplot('price')
# plt.savefig('graphs/price_boxplot.png', dpi=400, bbox_inches='tight')
plt.show()

checkin_dates = os.path.join(BASE_DIR, 'collection/dates.json')
with open(checkin_dates) as f:
    checkins = json.load(f)
dates = checkins['checkin']

"""Calculates the interquartile range and removes outliers"""
for date in dates:
    date_group = clean.groupby(['date']).get_group(date)
    LL, UL = cd.outlier_limits(date_group['price'])
    outliers = date_group['price'][(date_group['price'] < LL) | (date_group['price'] > UL)]
    clean.drop(outliers.index, axis=0, inplace=True)

output = os.path.join(BASE_DIR, 'data/cleaned_pricing_data.csv')
clean.to_csv(output, index=False)
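# The outlier step above depends on cd.outlier_limits returning lower/upper
# price limits. A minimal sketch of such a helper, assuming the usual
# 1.5 * IQR rule; the actual implementation in the cd module may differ.
import pandas as pd

def outlier_limits(prices: pd.Series):
    # Interquartile range: Q3 - Q1.
    q1 = prices.quantile(0.25)
    q3 = prices.quantile(0.75)
    iqr = q3 - q1
    # Values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are treated as outliers.
    return q1 - 1.5 * iqr, q3 + 1.5 * iqr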
import csv
import nltk
import time
import gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from progress.bar import Bar
from progress.spinner import Spinner

# Import own files
from clean import Clean

cleaner = Clean()

# Increase csv field size limit
csv.field_size_limit(2**30)

NUM_DOCS = 17153  # for progress bar purposes only

sentences = []
with open("dataset.csv", newline='', encoding='utf-8') as csvfile:
    # Start time
    start = time.time()

    # Start progress bar. max obtained from reading in the excel file and checking number of rows
    indexing_progress_bar = Bar("Reading in documents to train Word2Vec Model", max=NUM_DOCS)

    # Read in CSV dataset and remove headers from consideration
    csv_reader = csv.reader(csvfile)
import re
import json
from tqdm import tqdm
from clean import Clean


def get_word_list(filename):
    with open(filename, "r", encoding='utf-8') as inf:
        lines = inf.readlines()
    # print "type line: ", type(lines[0].encode("utf-8"))
    lines = [Clean.clean_word(line, clean_level='title') for line in lines]
    return lines


print(Clean.clean_word(u"\"你好 呀#\"$%^&*@!,。、;:‘’】季 候【"))


def main():
    with open("./410_baidu/410_disambi_infobox.csv", 'r', encoding='UTF-8') as inf:
        lines = inf.readlines()
    f = open("./410_baidu/410_disambi_infobox_out.csv", "w", encoding='utf-8')
    list_attr = []
    title_list = get_word_list("./410_baidu/410_title.csv")
    err_count = 0
    counts = {}
    for line in tqdm(lines):
        words = line.strip().split(",")
import csv
import sys

# Import own files
from clean import Clean
import vb_encoder

# Global definitions
csv.field_size_limit(2 ** 30)

NUM_DOCS = 17153  # for progress bar purposes only

COURT_RANKINGS = {
    3: ['sg court of appeal', 'sg privy council', 'uk house of lords', 'uk supreme court',
        'high court of australia', 'ca supreme court'],
    2: ['sg high court', 'singapore international commercial court', 'hk high court',
        'hk court of first instance', 'uk crown court', 'uk court of appeal', 'uk high court',
        'federal court of australia', 'nsw court of appeal', 'nsw court of criminal appeal',
        'nsw supreme court']
}

# Create instances of imported classes
cleaner = Clean()


def usage():
    print(
        "Usage: " + sys.argv[0] + " -i dataset-file -d dictionary-file -p postings-file"
    )


# Writes out the total number of documents in the collection to the postings file
# This is basically N, to compute inverse document frequency
def write_collection_size_to_disk(collection_size: int, out_postings):
    # Open our postings file
    f_postings = open(out_postings, "wb")
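# The comment above notes that the stored collection size is N for inverse
# document frequency. A sketch of how N would typically be used once read back,
# assuming the standard log-scaled idf; the helper name below is hypothetical.
import math

def idf(collection_size: int, doc_freq: int) -> float:
    # idf(t) = log10(N / df(t)); a term appearing in no document gets weight 0.
    if doc_freq == 0:
        return 0.0
    return math.log10(collection_size / doc_freq)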
def cleaning(self):
    self.__clear__()
    new_clean = Clean()
    new = Menu()
import pandas as pd
# import numpy as np

# org_data = pd.read_excel('Sharma_2018-07-24.xlsx', usecols='E:AJ')
# org_data = pd.read_excel('../data/all_data.xlsx', usecols='E:AJ')
# backup_data = org_data

from clean import Clean
clean = Clean(org_data)
clean.clean_data()
org_data = clean.org_data

from imputation import Impute
impute = Impute(org_data)
impute.impute_data()
org_data = impute.org_data
#!/usr/bin/env python
# coding=utf-8
from collections import defaultdict
from clean import Clean
from tqdm import tqdm

with open("./410_baidu/410_disambi_subject.csv") as inf:
    lines = inf.readlines()
    # all_subject = defaultdict(list)
    total_subject = []
    f = open("./410_baidu/disambi_subject.csv", "w")
    for line in tqdm(lines):
        words = line.strip().split(",")
        disambi = Clean.clean_word(words[0].decode('utf-8'), clean_level='disambi').encode("utf-8")
        subjects = words[1:]
        subjects = [
            Clean.clean_word(s.decode('utf-8'), clean_level="subject").encode("utf-8")
            for s in subjects
        ]
        # subjects = [s.replace("\"", "").strip("\\") for s in subjects]
        # subjects = [s.strip() for s in subjects]
        total_subject.extend(subjects)
        for subject in subjects:
            if subject == "":
                continue
            f.write("\"" + disambi + "\",\"" + subject + "\"\r\n")
        # all_subject[disambi].append(subjects)
    f.close()
from collections import defaultdict
from clean import Clean
from tqdm import tqdm
import csv

with open("source/disambi_topic.csv") as in_f, open("target/disambi_topic.csv", "w") as out_f:
    reader = csv.reader(in_f)
    total_topic = []
    for line in tqdm(reader):
        # print(line)
        disambi = line[0]
        topics = []
        for i in line[1].split(','):
            topics.extend(i.split())
        disambi = Clean.clean_word(disambi, clean_level='disambi')
        topics = [Clean.clean_word(s, clean_level="topic") for s in topics]
        total_topic.extend(topics)
        for topic in topics:
            if topic == "":
                continue
            if '[朱槿品种]' in disambi:
                disambi = '快乐[[朱槿品种]]'
            if '"' in disambi:
                disambi = disambi.replace('"', '""')
            if '"' in topic:
                topic = topic.replace('"', '""')
            out_f.write("\"" + disambi + "\",\"" + topic + "\"\n")

total_topic = list(set(total_topic))
print("Total topics:%d " % (len(total_topic)))
#!/usr/bin/env python
# coding=utf-8
import re
from clean import Clean
from tqdm import tqdm

with open("./410_baidu/410_disambi.csv") as inf:
    title_dict = {}
    count = 0
    lines = inf.readlines()
    for line in tqdm(lines):
        words = line.strip().split("\",\"")
        if len(words) != 4:
            count += 1
        clean_disambi = Clean.clean_word(words[0].decode('utf-8'), 'disambi')
        title_dict[clean_disambi] = words[1:]
    print "Error lines: ", count

with open("./410_baidu/410_disambi_new.csv", "w") as ouf:
    for i in title_dict.keys():
        ouf.write("\"" + i.encode('utf-8') + "\",\"" + "\",\"".join(title_dict[i]) + "\r\n")
def clean(sentence):
    return Clean(sentence).clean_html_and_js_tags().clean_text().text
#!/usr/bin/env python
# coding=utf-8
from collections import defaultdict
from clean import Clean
from tqdm import tqdm

with open("./410_baidu/410_disambi_subject.csv", "r", encoding='utf-8') as inf:
    lines = inf.readlines()
    # all_subject = defaultdict(list)
    total_subject = []
    f = open("./410_baidu/disambi_subject.csv", "w", encoding='utf-8')
    for line in tqdm(lines):
        words = line.strip().split(",")
        disambi = Clean.clean_word(words[0], clean_level='disambi')
        subjects = words[1:]
        subjects = [
            Clean.clean_word(s, clean_level="subject") for s in subjects
        ]
        # subjects = [s.replace("\"", "").strip("\\") for s in subjects]
        # subjects = [s.strip() for s in subjects]
        total_subject.extend(subjects)
        for subject in subjects:
            if subject == "":
                continue
            f.write("\"" + disambi + "\",\"" + subject + "\"\r\n")
        # all_subject[disambi].append(subjects)
    f.close()

total_subject = list(set(total_subject))
print("Total subjects: ", len(total_subject))

with open("./410_baidu/all_subject.csv", "w", encoding='utf-8') as ouf:
    # The original snippet ends at the line above; presumably each unique subject is written out here.
    for subject in total_subject:
        ouf.write("\"" + subject + "\"\r\n")