def init_tcex(requires_tc_token=False):
    """Initialize the tcex instance."""
    tcex_instance = tcex.TcEx()
    tcex_instance.log.debug('Creating content in {}. If this is not correct, pass in a different owner name using the --api_default_org flag.'.format(tcex_instance.args.api_default_org))
    tcex_instance.args.api_access_id = os.environ['API_ACCESS_ID']
    tcex_instance.args.tc_temp_path = 'log'
    # this manually sets the logging level
    tcex_instance.log.setLevel(logging.DEBUG)
    tcex_instance.args.tc_log_path = 'log'
    tcex_instance.args.tc_out_path = 'log'
    tcex_instance.args.tc_api_path = os.environ['TC_API_PATH']
    tcex_instance.args.api_default_org = os.environ['API_DEFAULT_ORG']
    tcex_instance.args.api_secret_key = os.environ['API_SECRET_KEY']

    if requires_tc_token:
        if os.environ.get('TC_TOKEN'):
            tcex_instance.args.tc_token = os.environ['TC_TOKEN']
            # parse the expiration timestamp from the tc_token
            tcex_instance.args.tc_token_expires = tcex_instance.args.tc_token.split(':')[4]
        # if the request requires a token and a token is not found, raise an error
        else:
            raise RuntimeError('The TC_TOKEN environmental variable is required and was not found. Please add it (you can find instructions for doing so here: https://gitlab.com/fhightower-tc/tcex-playground#setup).')

    # clear out any data in the source
    cleaner.clean(tcex_instance)
    validator.validate(tcex_instance, expected_groups=0, expected_indicators=0)
    return tcex_instance

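# A minimal usage sketch for the helper above, assuming the environment
# variables it reads (API_ACCESS_ID, API_SECRET_KEY, API_DEFAULT_ORG,
# TC_API_PATH, and optionally TC_TOKEN) are already exported; the log call is
# illustrative only and not part of the original module.
if __name__ == '__main__':
    tcex_instance = init_tcex(requires_tc_token=False)
    tcex_instance.log.info('Initialized against {}'.format(tcex_instance.args.tc_api_path))
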
def post(self):
    if ifConnected == 1:
        bl.quit_mesh()
    conn = db_connect.connect()
    query = conn.execute("delete from devices;")
    cleaner.clean()
    return {'status': 'OK'}

def main():
    creds = config.get_creds()
    sftp.download(creds.get("sftp_url"), creds.get("sftp_username"),
                  creds.get("sftp_password"), creds.get("localpath"))
    cleaner.clean(creds.get("localpath"))
    merge.merge(creds.get("localpath"))
    scraper.scrape(creds)

def get(self, chinese):
    input = [chinese, '']
    input = clean(input)
    prepare = vectorizer.transform(clean(input))
    pred = clf.predict(prepare)
    level = np.asscalar(np.int16(pred[0]))
    result = {chinese: level}
    return result

def logistic_regression():
    if request.method == 'POST':
        # if user presses submit after uploading dataset and target
        if 'file' in request.files and 'target' in request.form:
            file = request.files['file']
            session['filename'] = file.filename
            data = pd.read_csv(file)
            data = cleaner.clean(data)
            session['target'] = request.form['target']
            session['target'] = cleaner.fix_target(session['target'])
        # if user only needs to upload target and presses submit
        elif 'get_target' in request.form:
            session['target'] = request.form['get_target']
            session['target'] = cleaner.fix_target(session['target'])
            data = pd.read_pickle(session['filename'])
        # perform logistic regression
        logreg_model = logreg.logreg(data, session['target'])
        return render_template('linearreg.html')
    # if user needs to upload csv and target
    if request.method == 'GET':
        if not os.path.isfile(session['filename']):
            return render_template('linearreg.html')
        else:
            # if user only needs to upload target
            return render_template('linearreg.html', get_target=True)

def main():
    config = parse_config(sys.argv)
    results = do_search(config)
    if len(results) == 0:
        print('No results found.')
        sys.exit(1)
    elif len(results) == 1:
        result = results[0]
    else:
        result = do_disambiguate(results)

    filename = do_download(result)
    print('Saved pronunciation to ./{}'.format(filename))

    if config['clean']:
        print('Cleaning..')
        username = result['standard_pronunciation']['username']
        profile = cleaner.find_noise_profile(username)
        if profile is None:
            print('No noise profile exists for {}. '
                  'We will try to create one.'.format(username))
        cleaned_filename, new_profile = cleaner.clean(filename, username, noise_profile=profile)
        if profile is None:
            if new_profile is None:
                print('Noise profile creation aborted.')
            else:
                print('Saved new profile to {}'.format(new_profile))
        print('Cleaned pronunciation saved to ./{}'.format(cleaned_filename))

def make(sourcefile, modulename):
    import cleaner, preprocess
    if not os.access(sourcefile, os.F_OK):
        raise IOError(sourcefile)  # sourcefile
    basename = os.path.basename(sourcefile)
    preprocessed = "%s.c" % (modulename)
    cleaned = "%s_clean.c" % (modulename)
    #xml = "%s.xml"%(modulename)
    pyfinal = "%s.py" % (modulename)
    if not os.access(pyfinal, os.F_OK):
        if not os.access(cleaned, os.F_OK):
            if not os.access(preprocessed, os.F_OK):
                # preprocess the file
                if preprocess.process(sourcefile, preprocessed) > 0:
                    return
                log.info('PREPROCESS - OK')
            # clean it
            if cleaner.clean(preprocessed, cleaned) > 0:
                return
            log.info('CLEAN - OK')
        # generate pyfinal
        if gen(cleaned, modulename) > 0:
            return
        log.info('PYFINAL - OK')
    __import__(modulename)
    import inspect
    nbClass = len(inspect.getmembers(sys.modules[modulename], inspect.isclass))
    nbMembers = len(inspect.getmembers(sys.modules[modulename]))
    log.info("module %s has %d members for %d class" % (modulename, nbMembers, nbClass))

def test_clean__raise_exception_when_no_closing_tag():
    def mock_walk(path: str):
        return [
            ('root_dir', [], ['mango.py']),
        ]

    line_processor = lambda line: None
    path = '.'

    with patch('builtins.open', mock_open(read_data=FILE_WITHOUT_CLOSING_TAG)), \
            patch('os.walk', mock_walk), \
            patch('config.FILE_EXTENSIONS_TO_PROCESS', {'.py', '.js'}), \
            patch('cleaner.save_file', MagicMock()) as save_file_mock, \
            patch('config.OPENING', '# ▼▼▼ MY TEMP CODE. DELETE ME ▼▼▼'), \
            patch('config.ENDING', '# ▲▲▲ MY TEMP CODE. DELETE ME ▲▲▲'), \
            pytest.raises(cleaner.ClosingTagNotFoundException):
        cleaner.clean(path, line_processor)

    save_file_mock.assert_not_called()

def main(input_filepath, output_filepath):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """
    logger = logging.getLogger(__name__)
    logger.info('making final data set from raw data')

    data = clean(input_filepath)
    data.to_csv(output_filepath, index=False)

def generate(source='articles/cnbeta'):
    'combines cleaner and segmenter'
    import cleaner, segmenter
    documents = []
    items = cleaner.clean(source)
    documents = segmenter.segment(items)
    publish(documents, source)
    return documents

def load_docs(filepath, clean_text=True):
    ret = []
    for f_name in os.listdir(filepath):
        if clean_text:
            ret.append(
                clean(open(filepath + "/" + f_name, 'rb').read().decode('UTF-8')))
            continue
        ret.append(open(filepath + "/" + f_name, 'rb').read().decode('UTF-8'))
    return ret

def recommendations():
    s = session()
    rows = s.query(News).filter(News.label == None).all()
    classified_news = []
    for i in rows:
        prediction = model.predict(clean(i.title))
        for j in range(len(prediction)):
            if prediction[j] == 'good':
                classified_news.append(i)
            else:
                break
    return template('news_recommendations', rows=classified_news)

def test_clean(line_processor_name, file_data, expected_result):
    def mock_walk(path: str):
        return [
            ('root_dir', [], ['mango.py']),
        ]

    line_processor = cleaner.LINE_PROCESSORS[line_processor_name]
    path = '.'

    with patch('builtins.open', mock_open(read_data=file_data)), \
            patch('os.walk', mock_walk), \
            patch('cleaner.save_file', MagicMock()) as save_file_mock, \
            patch('cleaner.FILE_EXTENSIONS_TO_PROCESS', {'.py', '.js'}), \
            patch('cleaner.OPENING', '# ▼▼▼ MY TEMP CODE. DELETE ME ▼▼▼'), \
            patch('cleaner.ENDING', '# ▲▲▲ MY TEMP CODE. DELETE ME ▲▲▲'):
        cleaner.clean(path, line_processor)

    expected = [l + '\n' for l in expected_result.split('\n')][:-1]  # split without deleting the delimiter
    assert (save_file_mock.call_args_list == [
        call('root_dir/mango.py', expected)
    ])

def test_clean__include_only_defined_extensions_and_exclude_dirs():
    def mock_walk(path: str):
        return [
            ('root_dir', [], ['mango.py', 'bannana.js', 'raspberry.html', 'lenmon.txt']),
            ('test/excluded_dir', [], ['blackberry.py', 'cherry.js']),
        ]

    line_processor = lambda line: None
    path = '.'

    with patch('builtins.open', mock_open(read_data='Some data about fruits, irrelevant')) as mocked_open, \
            patch('os.walk', mock_walk), \
            patch('cleaner.FILE_EXTENSIONS_TO_PROCESS', {'.py', '.js'}), \
            patch('cleaner.EXCLUDE_DIRS', {'excluded_dir'}):
        cleaner.clean(path, line_processor)

    assert mocked_open.call_args_list == [
        call('root_dir/mango.py', 'r'),
        call('root_dir/mango.py', 'w'),
        call('root_dir/bannana.js', 'r'),
        call('root_dir/bannana.js', 'w'),
    ]

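# The cleaner.clean(path, line_processor) tests above (closing-tag, parametrized,
# and extension/exclude-dir) all patch the same module-level names. A minimal
# sketch of the module they imply is given below: the names
# (FILE_EXTENSIONS_TO_PROCESS, EXCLUDE_DIRS, OPENING, ENDING, save_file,
# ClosingTagNotFoundException) come from the patches, but the exact behaviour
# between the markers is an assumption, not the real implementation.
import os

FILE_EXTENSIONS_TO_PROCESS = {'.py', '.js'}
EXCLUDE_DIRS = {'excluded_dir'}
OPENING = '# ▼▼▼ MY TEMP CODE. DELETE ME ▼▼▼'
ENDING = '# ▲▲▲ MY TEMP CODE. DELETE ME ▲▲▲'


class ClosingTagNotFoundException(Exception):
    """Raised when an OPENING marker has no matching ENDING marker."""


def save_file(filepath, lines):
    # write the processed lines back in place
    with open(filepath, 'w') as f:
        f.writelines(lines)


def clean(path, line_processor):
    for root, _dirs, files in os.walk(path):
        # skip anything under an excluded directory
        if any(part in EXCLUDE_DIRS for part in root.split(os.sep)):
            continue
        for name in files:
            if os.path.splitext(name)[1] not in FILE_EXTENSIONS_TO_PROCESS:
                continue
            filepath = os.path.join(root, name)
            with open(filepath, 'r') as f:
                lines = f.readlines()
            result, inside = [], False
            for line in lines:
                if OPENING in line:
                    inside = True
                    continue
                if ENDING in line:
                    inside = False
                    continue
                if inside:
                    # the line processor decides what (if anything) survives
                    processed = line_processor(line)
                    if processed is not None:
                        result.append(processed)
                else:
                    result.append(line)
            if inside:
                raise ClosingTagNotFoundException(filepath)
            save_file(filepath, result)
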
def read_emails(path):
    files = [f for f in listdir(path) if isfile(join(path, f))]
    try:
        del files[files.index('.DS_Store')]
    except ValueError:
        pass
    reader = WordListCorpusReader(path, files)
    text = clean(reader.raw())
    emails = split_emails(text, reader.fileids())
    return emails

def test_indicator_associations():
    """."""
    tcex = utility.init_tcex()
    tcex.jobs.indicator({
        "summary": "4.5.6.7",
        "type": "Address",
    })
    tcex.jobs.indicator({
        "summary": "ASN1234",
        "type": tcex.safe_rt('ASN', lower=False),
    })
    tcex.jobs.association({
        'association_value': 'ASN1234',
        'association_type': tcex.safe_rt('ASN', lower=False),
        'resource_value': '4.5.6.7',
        'resource_type': 'Address'
    })
    tcex.jobs.process(tcex.args.api_default_org)
    assert len(tcex.jobs.indicator_results['failed']) == 0
    assert len(tcex.jobs.indicator_results['not_saved']) == 0
    assert len(tcex.jobs.indicator_results['saved']) == 2
    verify_association_created(tcex)
    cleaner.clean()

def main():
    import json, sys, cleaner, glob
    with open('brown_output', 'w') as w_fh:
        lyrics_files = glob.glob('*.lyrics')
        for each_lyr in lyrics_files:
            for each_l in open(each_lyr):
                (title, js) = each_l.strip().split('\t')
                list_lyrics = json.loads(js)
                if list_lyrics is not None and len(list_lyrics) > 0:
                    lyr = cleaner.clean(list_lyrics[0])
                    for each_lyrline in lyr:
                        to_append = ' '.join(each_lyrline.strip()).encode('utf-8')
                        w_fh.write(to_append + '\n')
    return

def linearRegression():
    if request.method == 'POST':
        # if user presses submit after uploading dataset and target
        if 'file' in request.files and 'target' in request.form:
            file = request.files['file']
            session['filename'] = file.filename
            data = pd.read_csv(file)
            data = cleaner.clean(data)
            session['target'] = request.form['target']
            session['target'] = cleaner.fix_target(session['target'])
        # if user only needs to upload target and presses submit
        elif 'get_target' in request.form:
            session['target'] = request.form['get_target']
            session['target'] = cleaner.fix_target(session['target'])
            data = pd.read_pickle(session['filename'])
        # perform linear regression
        linreg_model = linreg.linreg(data, session['target'])
        ans = ''
        eq = g.selected_features + ' * ' + str(linreg_model.coef_[0])
        if np.sign(linreg_model.coef_) > 0:
            ans = '+ ' + eq
        else:
            ans = '- ' + eq
        return render_template('linearreg.html', intercept=linreg_model.intercept_,
                               coef_name=g.selected_features, coef_num=linreg_model.coef_,
                               r_squared=g.r_squared, mae=g.mae, eq=ans)
    # if user needs to upload csv and target
    if request.method == 'GET':
        if not os.path.isfile(session['filename']):
            return render_template('linearreg.html')
        else:
            # if user only needs to upload target
            return render_template('linearreg.html', get_target=True)

def eda():
    if request.method == 'GET':
        return render_template('eda.html')
    if request.method == 'POST' and 'file' in request.files:
        file = request.files['file']
        session['filename'] = file.filename
        data = pd.read_csv(file)
        data = cleaner.clean(data)
        data.to_pickle(session['filename'])
        return render_template('eda.html', tables=[data.head().to_html()], nulls=g.nulls,
                               duplicates=g.duplicates, outliers=g.outliers, memory=g.memory)

def process(tweet, relevance):
    reload(sys)
    sys.setdefaultencoding('utf8')
    text = tweet.text.encode('utf-8', errors='ignore')
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=False)
    entity_names, entity_locs, entity_per = [], [], []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))
        entity_locs.extend(extract_entity_logs(tree))
        entity_per.extend(extract_entity_pers(tree))
    a = Scholarship()
    a.tweet_id = tweet.id
    a.created_at = tweet.created_at
    a.tweet_lang = tweet.lang
    a.user_name = tweet.user.screen_name
    if tweet.place is not None:
        a.user_country = tweet.place.country
    a.text = clean(tweet.text)
    a.scholarship_name = return_scholarship_name(entity_names, entity_locs, entity_per)
    a.university = return_university_name(text)
    a.deadline = return_deadline(text)
    a.category = return_category(text)
    a.info_url = return_urls(tweet)
    context = find_place(a.university, entity_locs)
    a.country = return_country(context['lng'], context['lat'])
    a.longitude = context['lng']
    a.latitude = context['lat']
    if a.scholarship_name is not None and a.longitude is not None and a.latitude is not None:
        a.markerName = preprocess_str(a.scholarship_name) + ' , ' + context['place']
        a.markerType = context['type']
    else:
        a.markerType = a.markerName = None
    a.relevant = relevance
    try:
        a.save()
    except Exception as e:
        print "there is a problem ", e

def main(caption):
    model = Word2Vec.load('w2v/word2vec.bin')
    text = clean(caption, lemmatize=True, stop_words=True)
    sw = set(stopwords.words('english'))
    hashtags = []
    for i in text:
        try:
            hashtag = model.wv.most_similar(i)
        except KeyError:
            hashtag = False
        if hashtag:
            for tags, score in hashtag:
                if tags not in punctuation and tags not in sw and len(tags) > 2:
                    tag = f"#{tags}"
                    hashtags.append(tag)
    print(hashtags)

def extractDoc(ext):
    root = 'data'
    data = []
    for f in os.listdir(os.path.join(root, ext))[:5]:
        with open(os.path.join(root, ext, f), 'r') as sc:
            sc = clean(sc.read(), 'cpp')
            data.append(sc)
            print "[SUCCESS] Read", os.path.join(root, ext, f)

    vectorizer = TfidfVectorizer(tokenizer=tokenize, ngram_range=(1, 2))
    X = vectorizer.fit_transform(data)
    del data

    features_by_gram = defaultdict(list)
    for f, w in zip(vectorizer.get_feature_names(), vectorizer.idf_):
        features_by_gram[len(f.split(' '))].append((f, w))
    top_n = 50
    for gram, features in features_by_gram.iteritems():
        top_features = sorted(features, key=lambda x: x[1], reverse=True)[:top_n]
        top_features = [f[0] for f in top_features]
        print '{}-gram top:'.format(gram), top_features

def train():
    f = open(sys.argv[1], "r")
    corpus = []
    target = []
    failed = 0
    for line in f:
        try:
            pieces = line.split(DELIMITER)
            pieces = map(lambda x: x[1:len(x) - 1], pieces)
            corpus.append(clean(pieces[5]))
            target.append(int(pieces[0]))
        except:
            failed += 1

    X = VECTORIZER.fit_transform(corpus)
    # save vectorizer.
    with open('vectorizer.pkl', 'wb') as fid:
        cPickle.dump(VECTORIZER, fid)

    # save logistic regression model.
    logreg = linear_model.LogisticRegression(C=1e5)
    logreg.fit(X, np.array(target))
    with open('logreg.pkl', 'wb') as fid:
        cPickle.dump(logreg, fid)

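# A minimal sketch of the matching inference step for the train() function
# above, assuming the same clean() helper and the vectorizer.pkl / logreg.pkl
# files it writes; the predict() wrapper name is hypothetical and not part of
# the original script.
import cPickle


def predict(text):
    # reload the fitted vectorizer and model saved by train()
    with open('vectorizer.pkl', 'rb') as fid:
        vectorizer = cPickle.load(fid)
    with open('logreg.pkl', 'rb') as fid:
        logreg = cPickle.load(fid)
    features = vectorizer.transform([clean(text)])
    return logreg.predict(features)[0]
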
def preprocess():
    vec = CountVectorizer()
    data = clean('lyrics.txt')
    vec.fit_transform([i[1] for i in data])
    shuffle(data)
    data, d2 = tts(data, test_size=0.1)
    data = chunkify(data, 10)
    lyrics = []
    for d in data:
        temp = [[], []]
        for item in d:
            temp[0].append(item[1])
            temp[1].append(item[0])
        temp[0] = vec.transform(temp[0]).toarray().tolist()
        lyrics.append(temp)
    test = [[], []]
    for item in d2:
        test[0].append(item[1])
        test[1].append(item[0])
    test[0] = vec.transform(test[0]).toarray().tolist()
    return [lyrics, test]

def make(sourcefile, modulename, target=False):
    '''
    Using gccxml directly hurts ctypeslib performance, but on some
    libraries we don't have a choice.
    '''
    if not os.access(sourcefile, os.F_OK):
        raise IOError(sourcefile)  # sourcefile
    basename = os.path.basename(sourcefile)
    preprocessed = "%s.c" % (modulename)
    cleaned = "%s_clean.c" % (modulename)
    xml = "%s.xml" % (modulename)
    pyfinal = "%s.py" % (modulename)
    if target:
        gen2(sourcefile, modulename, target)
        log.info('PYFINAL - OK')
    else:
        if not os.access(pyfinal, os.F_OK):
            if not os.access(cleaned, os.F_OK):
                if not os.access(preprocessed, os.F_OK):
                    # preprocess the file
                    if preprocess.process(sourcefile, preprocessed) > 0:
                        return
                    log.info('PREPROCESS - OK')
                # clean it
                if cleaner.clean(preprocessed, cleaned) > 0:
                    return
                log.info('CLEAN - OK')
            # generate pyfinal
            if gen(cleaned, modulename) > 0:
                return
            log.info('PYFINAL - OK')
    __import__(modulename)
    import inspect
    nbClass = len(inspect.getmembers(sys.modules[modulename], inspect.isclass))
    nbMembers = len(inspect.getmembers(sys.modules[modulename]))
    log.info("module %s has %d members for %d class" % (modulename, nbMembers, nbClass))

# param 2 : output file path
####################
# Examples of use :
#
# To download 1000 questions with potential answers with java tag :
# python stackoverflow.py sdd java 3 javarawdump.json
#
# To clean data :
# python stackoverflow.py c javarawdump.json javacleaneddump.json

import sys
from cleaner import clean
from downloader import download

try:
    mode = sys.argv[1]
    if mode == 'sdd':  # simple data downloader
        tag = sys.argv[2]
        amount = int(sys.argv[3])
        output = sys.argv[4]
        download(tag, amount, output)
    elif mode == 'c':  # clean data
        input = sys.argv[2]
        output = sys.argv[3]
        clean(input, output)
    elif mode == 'add':
        print 'not supported yet'
    else:
        print 'Not supported work mode'
except:
    print 'Unspecified error occurred ', sys.exc_info()[0], sys.exc_info()[1]

def summarize(text):
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize, sent_tokenize
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.cluster import KMeans
    import cleaner

    words = word_tokenize(text)
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words("english"))
    f = open("stopwords.txt")
    for stops in f.read().split():
        stop_words.add(stops)
    #print(sentences)

    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(sentences)
    true_k = 2
    model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
    model.fit(X)

    c1 = list()
    c2 = list()
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    for i in range(true_k):
        #print("Cluster %d:" % i)
        for ind in order_centroids[i, :10]:
            if i == 0:
                c1.append(terms[ind])
            else:
                c2.append(terms[ind])
    #print("Cluster 1 :")
    #print(c1)
    #print("Cluster 2 : ")
    #print(c2)

    sentence_score = {}
    sc = 1.0
    for sentence in sentences:
        sc = 1.0
        for word in c1:
            #print("\n* " + word)
            if word in sentence.lower():
                if sc <= 0:
                    sc = 0
                if sentence in sentence_score.keys():
                    sentence_score[sentence] += sc
                    sc = sc - 0.05
                    #print(sentence_score[sentence])
                else:
                    sentence_score[sentence] = sc
                    sc = sc - 0.05
                    #print(sentence_score[sentence])
    #print(sentence_score)

    sum_total = 0
    for sentence in sentences:
        if sentence in sentence_score.keys():
            sum_total += sentence_score[sentence]
    #print("Sum total : " + str(sum_total))
    average_score = int(sum_total / len(sentence_score))
    #print("Average = " + str(average_score))

    summary = ""
    # change the value to have more fun!
    for sentence in sentences:
        #print(sentence)
        if sentence in sentence_score.keys() and sentence_score[sentence] > 2.6 * average_score:
            summary += "" + cleaner.clean(sentence) + "\n\n"
    print(summary)

    sentence_score2 = {}
    for sentence in sentences:
        sc = 1.0
        for word in c2:
            #print("\n* " + word)
            if word in sentence.lower():
                if sc <= 0:
                    sc = 0
                if sentence in sentence_score2.keys():
                    sentence_score2[sentence] += sc
                    sc = sc - 0.05
                    #print(sentence_score2[sentence])
                else:
                    sentence_score2[sentence] = sc
                    sc = sc - 0.05
                    #print(sentence_score2[sentence])
    #print(sentence_score2)

    sum_total = 0
    for sentence in sentences:
        if sentence in sentence_score2.keys():
            sum_total += sentence_score2[sentence]
    #print("Sum total : " + str(sum_total))
    average_score = int(sum_total / len(sentence_score2))
    #print("Average = " + str(average_score))

    summary = ""
    # change the value to have more fun!
    for sentence in sentences:
        #print(sentence)
        if sentence in sentence_score2.keys() and sentence_score2[sentence] > 2.6 * average_score:
            summary += "" + cleaner.clean(sentence) + "\n\n\n"
    return (summary)

import twitter_stream
import cleaner

if __name__ == '__main__':
    # raw_data file path
    raw_data = 'data/twitter_data.json'

    while True:
        ## data retrieval
        twitter_stream.listen(raw_data)
        ## data cleaning
        european = cleaner.clean(raw_data)
        # insert
        cleaner.insert(european)
        ## erasing the file content
        f = open(raw_data, 'w')
        f.close()

from qrGenerator import generate
from qrGUI import show
from cleaner import clean
import easygui

print("SENDER starts working")
print("Generate QR codes")
generate(easygui.fileopenbox(default="../img/*"))
print("QR codes ready")
print("Sending data")
show()
clean()

    best_params = grid_result.best_params_
    print('best_params are:', best_params)
    rfr = RandomForestRegressor(max_depth=best_params["max_depth"], n_estimators=210,
                                random_state=False, verbose=False)
    # Perform K-Fold CV
    rfr.fit(X, y)
    #scores = cross_val_score(rfr, X, y, cv=10, scoring='neg_mean_absolute_error')
    #return [scores, rfr]
    return rfr


s = 'DataSets/Train.csv'
df = clean(s)
print(df.head())
Y = df['traffic_volume'].values
X = df.drop(['date_time', 'traffic_volume', 'dew_point'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

rfr = rfr_model(X, Y)
#print('scores are:', scores)
y_pred = rfr.predict(x_test)
err = mean_squared_error(y_test, y_pred)
err_log = mean_squared_log_error(y_test, y_pred)

data = pd.read_csv(path_training_data, header=None, names=cols, encoding='latin-1')
print('_______________________________________________________')
#print(len(data))
#print(data.head())

data.drop(['id', 'date', 'query_string', 'user'], axis=1, inplace=True)
X = data['text']
print(type(X))
print(X[:5])
y = data['sentiment']

clean_training_data = cleaner.clean(X)
#print('_______________________________________________________')
print(len(X))
print(y[:250])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.02, random_state=0)
print(99999999999999999999)

vectorizer = CountVectorizer(ngram_range=(1, 2))
training_features = vectorizer.fit_transform(X_train)
print(8888888888888888888888)

def main():
    # parse the files
    c = parse_css()
    h = parse_html()
    unused, undefined, fileNames, css = {}, {}, [], []
    results, fileCount = '', 0

    # identify UNUSED classes
    for cla, num in c[0][0].items():
        x = cla.split()
        # no pseudoclasses allowed
        if ':' not in x[0]:
            css.append(x[0])
        if x[0] not in h[0][0][0]:
            unused[cla] = num

    # identify UNUSED IDs
    for ID, num in c[1][0].items():
        y = ID.split()
        # no pseudoclasses allowed
        if ':' not in y[0]:
            css.append(y[0])
        if y[0] not in h[0][1][0]:
            unused[ID] = num

    i = f'Identified {c[0][1]} unique classes and {c[1][1]} unique IDs.\n'
    print('\n' + i)
    results += i

    # identify UNDEFINED classes and IDs
    for d in h[0]:
        for dd in d:
            for rule, file in dd.items():
                if rule not in css:
                    undefined[rule] = file
                    pre = 'ID: ' if rule[0] == '#' else 'class:'
                    o = f'Undefined {pre} {rule} : {file}'
                    print(o)
                    results += '\n' + o
    print()
    results += '\n'

    # copy to allow deleting
    final = dict(unused)

    # identify pseudoclasses
    for rule, num in unused.items():
        z = rule.split()
        r, fn = z[0], z[2]
        # get filenames
        if fn not in fileNames:
            fileNames.append(fn)
            fileCount += 1
        # if pseudoclass
        if ':' in r:
            rr = r.split(':')[0]
            # if rule exists and isn't unused
            if rr in css and rr + ' : ' + fn not in unused:
                del final[rule]
                continue
        o = ''
        if z[0][0] == '.':
            o = f'Unused class: {rule}{num}'
        elif z[0][0] == '#':
            o = f'Unused ID: {rule}{num}'
        print(o)
        results += '\n' + o

    # update dict with full css filepaths
    fullFilePairs = updateFilePaths(fileNames, h[1])

    # predefined in case file is already clean
    q, qq = 'no', 'no'

    if not final:
        o = 'No unused classes nor IDs!'
        print(o)
        results += o
    if final:
        # may i clean?
        q = input('\nMay I remove these unused rules and output new .css files? (yes/no): ')
        if q.lower() in ('yes', 'y'):
            clean(final, fileNames, fileCount)

    if not undefined:
        o = 'No undefined classes nor IDs!'
        print(o)
        results += o
    if undefined:
        # may i define?
        qq = input('May I add definitions for undefined rules? (yes/no): ')
        if qq.lower() in ('yes', 'y'):
            define(undefined, fullFilePairs)

    # no cleaning, but maybe a humble .txt file?
    if q.lower() in ('no', 'n') and qq.lower() in ('no', 'n'):
        qqq = input('Would you instead like a .txt file with your results? (yes/no): ')
        if qqq.lower() in ('yes', 'y'):
            with open('results.txt', 'w') as f:
                f.write(results)
            print('Wrote results.txt')
        elif qqq.lower() in ('no', 'n'):
            exit('Thank you.')
        else:
            exit('Invalid response.')
    else:
        exit('Thank you.')

        if s.query(News).filter(News.title == i['title'], News.author == i['author']).first():
            break
        else:
            s.add(News(**i))
    s.commit()
    redirect("/news")


@route('/recommendations')
def recommendations():
    s = session()
    rows = s.query(News).filter(News.label == None).all()
    classified_news = []
    for i in rows:
        prediction = model.predict(clean(i.title))
        for j in range(len(prediction)):
            if prediction[j] == 'good':
                classified_news.append(i)
            else:
                break
    return template('news_recommendations', rows=classified_news)


if __name__ == '__main__':
    s = session()
    rows = s.query(News).filter(News.label != None).all()
    X_train = [clean(row.title) for row in rows]
    y_train = [row.label for row in rows]
    model = NaiveBayesClassifier()
    model.fit(X_train, y_train)
    run(host="localhost", port=8080)

    }
}

TYPES = ["dev", "test", "train"]

if __name__ == '__main__':
    IN_DIR = sys.argv[1]
    OUT_DIR = sys.argv[2]
    data = {}

    try:
        os.mkdir(IN_DIR)
    except OSError:
        pass

    for t in TYPES:
        for country, dataset in DATASET_FILES.items():
            reader = Reader(IN_DIR + '/' + dataset[t])
            data[t] = list(zip(reader.y(), reader.X()))

        with open('{}/{}.tsv'.format(OUT_DIR, t), 'w') as out:
            csv_out = csv.writer(out, delimiter='\t')
            for row in data[t]:
                sentiment, text = row
                text = clean(text)
                if sentiment:
                    csv_out.writerow([sentiment, text])
                else:
                    csv_out.writerow([text])

all = True

if all:
    ratio = 25  # training-to-test-set ratio
    train_corpus = main_corpus[:(ratio * len(main_corpus) // (ratio + 1))]
    train_corpus_target = main_corpus_target[:(ratio * len(main_corpus) // (ratio + 1))]
    test_corpus = main_corpus[(len(main_corpus) - (len(main_corpus) // (ratio + 1))):]
    test_corpus_target = main_corpus_target[(len(main_corpus) - len(main_corpus) // (ratio + 1)):]
else:
    from cleaner import clean
    train_corpus = main_corpus
    train_corpus_target = main_corpus_target
    # "My name is Li Ming, applying for the sales manager position. Miss Wang asked me
    # to come for an interview at 2:30 this afternoon." / "You are one year older than him."
    test_corpus = clean(["我叫李明,应聘销售经理。是王小姐让我下午两点半来面试的。", "你比他大一岁"])
    test_corpus_target = [5, 2]

# size of datasets
train_corpus_size_mb = size_mb(train_corpus)
test_corpus_size_mb = size_mb(test_corpus)

print("%d documents - %0.3fMB (training set)" % (len(train_corpus_target), train_corpus_size_mb))
print("%d documents - %0.3fMB (test set)" % (len(test_corpus_target), test_corpus_size_mb))
print("%d categories" % len(my_categories))
print()
print("Extracting features from the training data using a sparse vectorizer...")

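# The slicing above implements a 25:1 train/test split by hand. A minimal
# sketch of the same split done with scikit-learn is shown below; it assumes
# main_corpus and main_corpus_target are parallel lists, and the variable
# names simply mirror the block above rather than any real module.
from sklearn.model_selection import train_test_split

train_corpus, test_corpus, train_corpus_target, test_corpus_target = train_test_split(
    main_corpus, main_corpus_target,
    test_size=1 / 26,   # roughly one part test for every 25 parts training
    shuffle=False,      # keep the original order, like the manual slicing
)
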