def create_corpus_from_web(url_file, raw=False):
    raw_sp2txt = {}
    proc_sp2txt = {}
    speech_titles = {}
    with open(url_file) as U:
        url_list = [url.strip() for url in U.readlines()]
    for doc_index, url in enumerate(url_list):
        pprint.pprint('Grabbing URL: ' + str(url))
        article = grab_link(url)
        if not (article and article.cleaned_text and article.title):
            pprint.pprint('Skipping. No content in URL: ' + url)
            continue
        title = unidecode.unidecode_expect_nonascii(article.title)
        speech_titles[doc_index] = title
        _raw_input = article.cleaned_text
        text = unidecode.unidecode_expect_nonascii(_raw_input)
        # lower-case and replace non-word characters and digits with spaces
        text = re.sub(r"[\W\d]", " ", text.lower().strip())
        lowers = text.replace('\n', ' ').replace('\r', ' ')
        # collapse runs of spaces into single spaces
        while '  ' in lowers:
            lowers = lowers.replace('  ', ' ')
        # store raw text -- for sentence extraction
        raw_sp2txt[doc_index] = lowers
        # store no_punctuation for NMF
        no_punctuation = lowers.translate(str.maketrans('', '', string.punctuation))
        proc_sp2txt[doc_index] = no_punctuation
    return proc_sp2txt, raw_sp2txt, speech_titles
def create_corpus_from_html(raw_html_path, raw=False):
    raw_sp2txt = {}
    proc_sp2txt = {}
    speech_titles = {}
    for subdir, dirs, files in os.walk(raw_html_path):
        for doc_index, each_file in enumerate(files):
            file_path = subdir + os.path.sep + each_file
            with open(file_path, 'r') as htmlfile:
                raw_content = htmlfile.read()
            article = Goose().extract(raw_html=raw_content)
            if not (article and article.cleaned_text and article.title):
                continue
            #print('Processing article: ', article.title)
            speech_titles[doc_index] = unidecode.unidecode_expect_nonascii(article.title)
            text = unidecode.unidecode_expect_nonascii(article.cleaned_text)
            # lower-case and replace non-word characters and digits with spaces
            text = re.sub(r"[\W\d]", " ", text.lower().strip())
            lowers = text.replace('\n', ' ').replace('\r', ' ')
            # collapse runs of spaces into single spaces
            while '  ' in lowers:
                lowers = lowers.replace('  ', ' ')
            # store raw text -- for sentence extraction
            raw_sp2txt[doc_index] = lowers
            # store no_punctuation for NMF
            no_punctuation = lowers.translate(str.maketrans('', '', string.punctuation))
            proc_sp2txt[doc_index] = no_punctuation
    return proc_sp2txt, raw_sp2txt, speech_titles
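
# A hedged usage sketch, not from the original project: 'urls.txt' is a
# hypothetical file with one article URL per line, feeding the two corpus
# builders above.
if __name__ == '__main__':
    proc, raw, titles = create_corpus_from_web('urls.txt')
    print('%d web documents collected' % len(titles))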
def paste():
    #num1 = randint(1, 10)
    #num2 = randint(1, 10)
    if request.method == 'POST':
        # any value in the 'email_add' field aborts with a 404 (spam trap)
        if request.form['email_add']:
            return render_template('404.html')
        #if (request.form['userSolution'] != request.form['sum']):
        #    return redirect(request.url)
        result = request.form
        if 'plaintext' not in result:
            flash('No text submitted')  # does not work yet
            return redirect(request.url)
        if 'author' not in result:
            return redirect(request.url)
        prose = unidecode_expect_nonascii(result['plaintext'])
        apis = result['apis']
        author = result['author']
        administrator = result['administrator']
        admin_notes = result['notes']
        global Doc
        Doc = read_document.Sample(prose, author, apis)
        Doc.administrator = administrator
        Doc.admin_notes = admin_notes
        Doc.timestamp = datetime.now()
        return redirect(url_for('feedback', timestamp=Doc.timestamp))
    return render_template('paste.html')
def parse_pdf_using_slate(filepath):
    """
    Parses the PDF file and returns its text in JSON format.
    :input: filepath: path of the PDF file that you want to parse.
    :output: returns the questions and creates a JSON file containing the text from the PDF.
    """
    all_questions = []
    with open(filepath, 'rb') as f:  # PDFs are binary
        document = slate.PDF(f)
        count = 0
        for each_page in document:
            # split the page on question numbers such as '1.', '23.', ...
            questions = re.split(r"\d+[.]",
                                 unidecode.unidecode_expect_nonascii(each_page))
            for each in questions:
                # print(each)
                split_question = each.split("(A")
                count = count + 1
                try:
                    question_dict = {}
                    if split_question[0] != "":
                        question_dict["question_statement"] = split_question[0]
                        question_dict["answer_options"] = "(A" + split_question[1]
                        all_questions.append(question_dict)
                except IndexError:
                    pass  # clock 9.50 pattern fails
    with open("data/parsed_questions.json", "w") as q_file:
        json.dump(all_questions, q_file, indent=4)
    return all_questions
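
# Hedged usage sketch ('data/question_bank.pdf' is a hypothetical path):
#
#     questions = parse_pdf_using_slate('data/question_bank.pdf')
#     print(len(questions), 'questions parsed')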
async def on_member_update(before, after):
    # `chars`, `dehoist_char`, `r` (RethinkDB), and `self` are provided by the
    # enclosing cog/module.
    g = after.guild
    isascii = lambda s: len(s) == len(s.encode())

    if after.display_name.startswith(tuple(chars)):  # BEGIN AUTO DEHOIST MEME
        exists = (lambda: list(
            r.table('settings').filter(
                lambda a: a['guild'] == str(g.id)).run(self.conn)) != [])()
        if not exists:
            return
        settings = list(
            r.table('settings').filter(
                lambda a: a['guild'] == str(g.id)).run(self.conn))[0]
        if 'auto_dehoist' in settings.keys():
            if settings['auto_dehoist']:
                try:
                    await after.edit(
                        nick=f'{dehoist_char}{after.display_name[0:31]}',
                        reason='[Automatic dehoist]')
                except discord.Forbidden:
                    return

    if not isascii(after.display_name) and not after.display_name.startswith(dehoist_char):
        exists = (lambda: list(
            r.table('settings').filter(
                lambda a: a['guild'] == str(g.id)).run(self.conn)) != [])()
        if not exists:
            return
        settings = list(
            r.table('settings').filter(
                lambda a: a['guild'] == str(g.id)).run(self.conn))[0]
        if 'auto_decancer' in settings.keys():
            if settings['auto_decancer']:
                # transliterate the nickname to plain ASCII
                aaa = unidecode.unidecode_expect_nonascii(after.display_name)
                if len(aaa) > 32:  # Discord caps nicknames at 32 characters
                    aaa = aaa[0:32 - 3] + '...'
                try:
                    await after.edit(nick=aaa, reason='[Automatic decancer]')
                except discord.Forbidden:
                    return

    if before.roles == after.roles:
        return
    if len(before.roles) < len(after.roles):
        return  # a role was added; we only act when roles were removed
    if after.roles == [after.guild.default_role]:
        # no roles left; should be after a manual untoss
        try:
            if self.rolebans[after.id][after.guild.id] in [None, []]:
                return  # they weren't rolebanned
            await after.edit(
                roles=self.rolebans[after.id][after.guild.id],
                reason='[Manual role restore]')
            self.rolebans[after.id][after.guild.id] = None
        except (KeyError, discord.Forbidden):
            return
def _normalize_string(self, string):
    ret_string = ''
    for char in string:
        # keep Greek letters and the Angstrom sign as-is
        if re.match(u'[Α-Ωα-ωÅ]', char) is not None:
            ret_string += str(char)
        else:
            ret_string += str(unidecode_expect_nonascii(str(char)))
    return ret_string
def _normalize_string(self, string):
    ret_string = ''
    for char in string:
        # keep Greek letters and the Angstrom sign as-is
        if re.match('[Α-Ωα-ωÅ]', char) is not None:
            ret_string += char
        else:
            ret_string += unidecode_expect_nonascii(char)
    # collapse whitespace runs into single spaces
    return ' '.join(ret_string.split())
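
# A standalone illustration (not from the original sources) of what the two
# _normalize_string variants above preserve: Greek letters and the Angstrom
# sign pass through, while everything else non-ASCII is transliterated.
import re
from unidecode import unidecode_expect_nonascii

def demo_normalize(s):
    out = ''
    for ch in s:
        out += ch if re.match('[Α-Ωα-ωÅ]', ch) else unidecode_expect_nonascii(ch)
    return ' '.join(out.split())

print(demo_normalize('αβ Å café'))  # -> 'αβ Å cafe'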
async def decancer(self, ctx, member: discord.Member):
    '"Decancer" a member, or strip all the non-ASCII characters from their name. Useful to make your chat look good.'
    if ctx.author.permissions_in(ctx.channel).manage_nicknames:
        cancer = member.display_name
        decancer = unidecode.unidecode_expect_nonascii(cancer)
        # decancer = re.sub(r'\D\W', '', decancer)
        if len(decancer) > 32:  # Discord caps nicknames at 32 characters
            decancer = decancer[0:32 - 3] + "..."
        await member.edit(nick=decancer)
        await ctx.send(
            f'Successfully decancered {cancer} to ​`{decancer}​`.')
    else:
        cancer = member.display_name
        decancer = unidecode.unidecode_expect_nonascii(cancer)
        await ctx.send(
            f'The decancered version of {cancer} is ​`{decancer}​`.')
def convert_text(text):
    """
    Function that converts text to unicode and checks if there are empty strings
    INPUT: text from the consumer complaint narratives
    OUTPUT: original text plus tag for empty strings
    """
    if text == '':
        print("FOUND MISSING TEXT")
        return "--MISSING INFO--"
    else:
        return unidecode.unidecode_expect_nonascii(text)
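
# Quick illustration of both branches of convert_text (inputs are made up):
print(convert_text(u'Pérez'))  # -> Perez
print(convert_text(''))        # prints FOUND MISSING TEXT, returns --MISSING INFO--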
def scan(app, threadnums, whitepaths, blackpaths, outputpath):
    global allfiles
    global pathq
    pathq = whitepaths
    filelist = []
    try:
        scanpath = pathq.get()
        if scanpath not in blackpaths and os.path.isdir(scanpath):
            filelist = os.listdir(scanpath)
            if len(filelist) < 100000:
                flist = []
                for file in filelist:
                    filename = os.path.join(scanpath, file)
                    if os.path.exists(filename):
                        if os.path.isdir(filename):
                            pathq.put(filename)
                            if threading.activeCount() < threadnums:
                                # spawn a worker for the newly queued directory
                                t = threading.Thread(
                                    target=scan,
                                    args=(app, threadnums, whitepaths,
                                          blackpaths, outputpath))
                                t.start()
                        elif os.path.isfile(filename):
                            lock.acquire()
                            allfiles += 1
                            print(app + ':%s' % allfiles)
                            lock.release()
                            fileinfo = os.stat(filename)
                            f = scanfile(
                                # os.stat_result has no st_type; the file type
                                # bits live in st_mode
                                fileinfo.st_mode, fileinfo.st_size,
                                fileinfo.st_mtime, fileinfo.st_atime,
                                fileinfo.st_ctime, datetime.datetime.now(),
                                fileinfo.st_uid, fileinfo.st_gid,
                                unidecode.unidecode_expect_nonascii(filename))
                            flist.append(f)
                dir = scandir(scanpath, flist)
                with open(outputpath, 'w') as fw:
                    fw.write(json.dumps(dir, default=lambda obj: obj.__dict__))
                flist = []
            else:
                print(scanpath + ' has: ' + str(len(filelist)) + ' files' +
                      ', do not scan now')
        else:
            print(scanpath + ' is not a valid path')
    except OSError as e:
        print('OSError:', e)
        logging.info('error at: ' +
                     datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
                     ' in ' + scanpath + ' OSError: ' + str(e))
    except Exception as e:
        print('Exception:', e)
        logging.info('error at: ' +
                     datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
                     ' in ' + scanpath + ' Exception: ' + str(e))
def unicodeToAscii(s):
    if isinstance(s, bytes):
        s = s.decode('utf-8')  # from byte string to text
    # replace "odd" characters with their closest ASCII equivalents
    s = unidecode_expect_nonascii(s)
    # strip any remaining combining marks and keep only known letters
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn' and c in all_letters)
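
# Illustration under an assumption: `all_letters` is not shown above, so a
# typical alphabet is supplied here just for the demo.
import string
all_letters = string.ascii_letters + " .,;'"  # assumption, not from the source
print(unicodeToAscii('Città di Måløy'))  # -> 'Citta di Maloy'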
def merge_csvs():
    # columns to keep from scraped data
    cols = [
        'results_wine_name', 'results_wine_name_url',
        'results_wine_reviews_name'
    ]
    # ten files initially scraped, iterate over them and merge
    frames = []
    for i in range(10):
        tmp = pd.read_csv('../../data/raw/first_1000/run_results' + str(i) +
                          '.csv')
        frames.append(tmp[cols])
    df = pd.concat(frames)
    # rename columns
    df.columns = ['wine_name', 'id', 'review_text']
    # remove null values
    not_null = ~df['review_text'].isnull()
    df = df[not_null]
    # drop duplicates (if created by scraper)
    df.drop_duplicates(inplace=True)
    # decode unicode characters
    df['wine_name'] = df['wine_name'].apply(
        lambda x: unidecode_expect_nonascii(x))
    df['review_text'] = df['review_text'].apply(
        lambda x: unidecode_expect_nonascii(x))
    # split the wine id from the url
    df['id'] = df['id'].apply(lambda x: int(x.split('?iWine=')[1]))
    return df
def filter_tokenized_file(filename, vocab, min_len=5):
    sents = []
    filters = {
        'copyright', 'chapter', 'edition', 'license', 'licensed', 'published'
    }
    with smart_open.open(os.path.join(BOOKCORPUS_PROCESSED_DIR, filename),
                         encoding='utf-8') as f:
        for line in f:
            sent = line.replace('\n', '')
            sent = sent.replace(u'\u2026', '')  # drop ellipsis characters
            sent = unidecode.unidecode_expect_nonascii(sent)
            sent = sent.replace('``', '"')
            sent = sent.replace('`', '\'')
            sent = sent.replace('\'\'', '"')
            words = sent.split()
            if len(words) < min_len:
                continue
            num_punct = 0
            num_known = 0
            filter_flag = False
            for word in words:
                if word.lower() in filters:
                    filter_flag = True
                    break
                if not word.isalnum():
                    num_punct += 1
                elif word in vocab:
                    num_known += 1
            if filter_flag:
                continue
            # drop sentences that are mostly punctuation or contain no known words
            thresh = len(words) * 0.5
            if num_punct >= thresh or num_known < 1:
                continue
            sents.append(sent)
    if len(sents) > 0:
        with smart_open.open(os.path.join(BOOKCORPUS_AUTHOR_DIR, filename),
                             'w', encoding='utf-8') as f:
            for sent in sents:
                f.write(sent + '\n')
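
# A standalone restatement (for illustration only) of the per-sentence test
# used above, omitting the copyright/chapter filter and the length check:
# keep a sentence only if fewer than half its tokens are punctuation and at
# least one alphanumeric token is in the vocabulary.
def _keep(words, vocab):
    num_punct = sum(1 for w in words if not w.isalnum())
    num_known = sum(1 for w in words if w.isalnum() and w in vocab)
    return num_punct < len(words) * 0.5 and num_known >= 1

print(_keep('the cat sat on the mat'.split(), {'cat'}))  # True
print(_keep('! ? . , ;'.split(), {'cat'}))               # False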
def get_unique_tweets():
    conn = sqlite3.connect('twitter_project.db')
    c = conn.cursor()
    c.execute("select * from tweets")
    tweet_dict = {}
    for each in c.fetchall():
        # column index 2 holds the tweet text; the first 15 chars cover the handle
        user = unidecode.unidecode_expect_nonascii(each[2][0:15])
        if user[0:4] == "RT @":
            user = user[4:]
        if user in tweet_dict:
            tweet_dict[user] += 1
        else:
            tweet_dict[user] = 1
    for each in tweet_dict.keys():
        print(str(each) + ": " + str(tweet_dict[each]))
    print("total unique tweets: " + str(len(tweet_dict.keys())))


#get_unique_tweets()
#get_users_in_database()
def parse_speeches(corpus_path):
    raw_sp2txt = {}
    proc_sp2txt = {}
    for subdir, dirs, files in os.walk(corpus_path):
        for each_file in files:
            #pprint.pprint("-- processing: {}".format(each_file))
            file_path = subdir + os.path.sep + each_file
            with open(file_path, 'r') as fhandle:
                _raw_input = fhandle.read()
            text = unidecode.unidecode_expect_nonascii(_raw_input)
            # lower-case and replace non-word characters and digits with spaces
            text = re.sub(r"[\W\d]", " ", text.lower().strip())
            lowers = text.replace('\n', ' ').replace('\r', ' ')
            # collapse runs of spaces into single spaces
            while '  ' in lowers:
                lowers = lowers.replace('  ', ' ')
            # store raw text -- for sentence extraction
            raw_sp2txt[each_file] = lowers
            # store no_punctuation for NMF
            no_punctuation = lowers.translate(str.maketrans('', '', string.punctuation))
            proc_sp2txt[each_file] = no_punctuation
    return proc_sp2txt, raw_sp2txt
def tokenize_file(filename):
    raw_filename = os.path.join(BOOKCORPUS_RAW_DIR, filename)
    tokenized_sents = []
    with smart_open.open(raw_filename, encoding='utf-8') as f:
        lines = f.readlines()
    if lines[0].strip() == '<!DOCTYPE html>':
        return  # skip files that are raw HTML rather than book text
    for line in lines:
        sents = sent_tokenize(line)
        for sent in sents:
            if sent.strip():
                if 'http' in sent:
                    continue
                sent = sent.strip()
                sent = sent.replace(u'\u2026', '')  # drop ellipsis characters
                sent = unidecode.unidecode_expect_nonascii(sent)
                sent = sent.replace('_', '')
                words = " ".join(word_tokenize(sent))
                words = words.replace('``', '"')
                words = words.replace('`', '\'')
                words = words.replace('\'\'', '"')
                if validate_words(words):
                    tokenized_sents.append(words)
    if len(tokenized_sents) <= 1:
        return
    with smart_open.open(os.path.join(BOOKCORPUS_PROCESSED_DIR, filename),
                         'w', encoding='utf-8') as f:
        for line in tokenized_sents:
            if line[-1] != '\n':
                line += '\n'
            f.write(line)
def remove_accents(place: str) -> str:
    return unidecode_expect_nonascii(place)
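
# Example: strip the diacritics from a place name.
print(remove_accents('São Paulo'))  # -> 'Sao Paulo'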
stage = all_url_stages[ci]
state = all_url_states[ci]

## ajax/json table
if "'ajax': jsonURL" in content:
    ## Construct URL
    jsonfn = content.split("jsonURL = '")[1].split("';")[0].split("?\'")[0]
    jsonURL = '/'.join(all_urls[ci].split('/')[:-2]) + '/' + jsonfn
    ## Request content
    json_raw = opener.open(six.moves.urllib.request.Request(jsonURL)).read()
    ## Decode content
    jsoncontent = unidecode_expect_nonascii(json_raw.decode('utf-8'))
    ## Convert to dataframe
    pdf = pd.read_json(jsoncontent, orient='split')
    ## Clean up dataframe content
    city_col_name = pdf.columns[pdf.columns.str.startswith('City / Town')].values[0]
    pdf['City/Town'] = pdf[city_col_name].apply(lambda x: x.split(' (')[0].strip())
    pdf['Facility_name_clean'] = pdf['Facility Name'].apply(
        lambda x: BeautifulSoup(x, 'html.parser').text.split(' (PDF')[0]
        .split('\n')[0].split("in new window.'>")[-1])
    pdf['Permit_URL'] = pdf['Facility Name'].apply(lambda x: [
        a.get('href') for a in BeautifulSoup(x, 'html.parser').findAll('a')
    ])
    pdf['Stage'] = stage
    pdf['State'] = state
    pdf['Watershed'] = pdf[city_col_name].apply(
        lambda x: x.split('(')[1][:-1].strip() if '(' in x else np.nan)
if time_hr not in table1[server][params[0]][params[5]][params[6]]:
    table1[server][params[0]][params[5]][params[6]][time_hr] = {}
table1[server][params[0]][params[5]][params[6]][time_hr]['server_type'] = server_name
table1[server][params[0]][params[5]][params[6]][time_hr]['count'] = int(pair[1])
table1[server][params[0]][params[5]][params[6]][time_hr]['qps'] = int(pair[1]) / 3600.0

exchange_table = {}
for id, name in exchange_data:
    exchange_table[str(id)] = name
exchange_table['0'] = 'Unknown'

country_table = {}
for id, name in country_data:
    # name = unicode(name, errors='replace')
    name = unidecode.unidecode_expect_nonascii(name)
    # name = name.decode("utf-8", "replace")
    country_table[str(id)] = name
country_table['0'] = 'Unknown'

getDictinory(response_table_dca, 'dca')
getDictinory(response_table_hkg, 'hkg')

get_logger.info('inserting data into influxdb')
points = []
now = datetime.datetime.today()
for i in table1:
    for j in table1[i]:
        if j == 'REC':
            for k in table1[i][j]:
                for l in table1[i][j][k]:
def removeNonAscii(val):
    return unidecode_expect_nonascii(val).replace(",ai", "-")
def jot2pondera():
    #os.rename('survey.csv', 'survey.csv.bak')
    #with open('survey.csv.bak', 'rU') as infile, open('survey.csv', 'w') as outfile:
    #    for line in infile:
    #        outfile.write(line.replace('\r\n', ''))
    #os.remove('survey.csv.bak')
    # map the JotForm survey column headers to short field names
    columns = {u'Submission ID': 'id',
               u'Submission Date': 'timestamp',
               u'¿Dónde estás?': 'geo',
               u'Tipo de Reporte': 'tipo',
               u'¿Qué actividades realizará el voluntario?': 'actividad',
               u'Describe brevemente el estado de tu inmueble': 'inmueble',
               u'¿Qué necesitas acopiar?': 'acopio',
               u'¿Qué necesita el hospital?': 'hospital',
               u'Escribe brevemente qué necesitas': 'necesita',
               u'¿Cuántos Voluntarios Necesitas?': 'voluntarios',
               u'Nombre del albergue': 'nombre_albergue',
               u'¿Qué ofrece el albergue?': 'albergue',
               u'Sólo si lo necesitas, escribe un breve comentario': 'comentario',
               }
    dtypes = {u'Submission ID': int,
              u'¿Dónde estás?': str,
              u'Tipo de Reporte': str,
              u'¿Qué actividades realizará el voluntario?': str,
              u'Describe brevemente el estado de tu inmueble': str,
              u'¿Qué necesitas acopiar?': str,
              u'¿Qué necesita el hospital?': str,
              u'Escribe brevemente qué necesitas': str,
              u'¿Cuántos Voluntarios Necesitas?': str,
              u'Nombre del albergue': str,
              u'¿Qué ofrece el albergue?': str,
              u'Sólo si lo necesitas, escribe un breve comentario': str,
              }
    url = 'https://www.jotform.com/csv/72647940607059'
    if os.path.exists('survey.csv'):
        os.rename('survey.csv', 'survey.bk')
    reporte = wget.download(url, 'survey.csv')
    # strip embedded CRLFs so each record sits on one line
    os.rename('survey.csv', 'survey.csv.bak')
    with open('survey.csv.bak', 'rU') as infile, open('survey.csv', 'w') as outfile:
        for line in infile:
            outfile.write(line.replace('\r\n', ''))
    os.remove('survey.csv.bak')
    df = pd.read_csv(reporte, encoding='utf-8', parse_dates=['Submission Date'],
                     dtype=dtypes, na_values=[''])
    df.rename(columns=columns, inplace=True)
    df = df.replace(np.nan, ' ')
    df = df.replace('\n', '', regex=True)
    cols = ['actividad', 'inmueble', 'acopio', 'hospital', 'necesita',
            'voluntarios', 'albergue', 'comentario']  # , 'nombre_albergue'
    df = df.where((pd.notnull(df)), '')
    for i, row in df.iterrows():
        for c in cols:
            if not isinstance(df.loc[i, c], float):
                df.loc[i, c] = ud.unidecode_expect_nonascii(df.loc[i, c])
    df.loc[:, 'lon'] = df.geo.str.extract(r'(\d+\.\d+).*(-\d+\.\d+)')[1]
    df.loc[:, 'lat'] = df.geo.str.extract(r'(\d+\.\d+).*(-\d+\.\d+)')[0]
    df.loc[:, 'store_point'] = 'POINT (' + df.lon + ' ' + df.lat + ')'
    df.loc[:, 'suc'] = ''
    # build a one-line summary ('suc') for each report, depending on its type
    for i, d in df.iterrows():
        s = str(d['timestamp']) + ' '
        if d['tipo'] == 'Acopio o Solicitud in situ':
            s += 'Se necesita: ' + ' '.join([str(d['acopio']), str(d['necesita'])])
        elif d['tipo'] == 'Acopio Hospital':
            s += 'Se necesita: ' + ' '.join([str(d['hospital']), str(d['necesita'])])
        elif d['tipo'] == 'Requiero Voluntarios':
            s += 'Se necesitan ' + str(d['voluntarios']) + \
                 ' voluntarios para realizar: ' + str(d['necesita'])
        elif d['tipo'] == 'Dar de Alta Albergue':
            s += str(d['nombre_albergue']) + ' - características: ' + str(d['albergue'])
        elif d['tipo'] == 'Dar de Alta Derrumbe':
            s += '<br>Derrumbe'
        elif d['tipo'] == 'Dar de Alta Daños':
            s += '<br>Daño'
        elif d['tipo'] == 'Requiero de Revisión en mi Inmueble':
            s += 'Descripción de daños: ' + str(d['inmueble'])
        s = s.replace('nan', '')
        df.loc[i, 'suc'] = str(df.loc[i, 'suc']) + str(s) + ' ' + str(df.loc[i, 'comentario'])
        #print(df.loc[i, 'suc'])
    df.to_csv('db_jot.csv', encoding='utf-8')
def __init__(self, long_string):
    """
    Create document instance for analysis.

    Opens and reads document to string raw_text. Textract interprets the
    document format and opens to plain text string (docx, pdf, odt, txt).

    Args:
        text (str): string to analyze.

    Public attributes:
    -user: (str) optional string to set username.
    -path: (str) relative path to document.
    -abs_path: (str) the absolute path to the document.
    -file_name: (str) the file name with extension of document (base name).
    -mime: tbd
    -guessed_type: makes best guess of mimetype of document.
    -file_type: returns index[0] from guessed_type.
    -raw_text: (str) plain text extracted from .txt, .odt, .pdf, .docx, and .doc.
    -ptext: (str) raw text after a series of regex expressions to eliminate special characters.
    -text_no_feed: (str) ptext with most new line characters eliminated; \\n\\n stays intact.
    -sentence_tokens: list of all sentences in a comma separated list derived by nltk.
    -sentence_count: (int) count of sentences found in list.
    -passive_sentences: list of passive sentences identified by the passive module.
    -passive_sentence_count: count of the passive_sentences list.
    -percent_passive: (float) ratio of passive sentences to all sentences in percent form.
    -be_verb_analysis: (int) sum number of occurrences of each to be verb (am, is, are, was, were, be, being, been).
    -be_verb_count: tbd
    -be_verb_analysis: tbd
    -weak_sentences_all: (int) sum of be verb analysis.
    -weak_sentences_set: (set) set of all sentences identified as having to be verbs.
    -weak_sentences_count: (int) count of items in weak_sentences_set.
    -weak_verbs_to_sentences: (float) proportion of sentences with to be verbs to all sentences, in percent (this might not be sound).
    -word_tokens: list of discrete words in text that breaks contractions up (default nltk tokenizer).
    -word_tokens_no_punct: list of all words in text including contractions but otherwise no punctuation.
    -no_punct: (str) full text string without sentence punctuation.
    -word_tokens_no_punct: uses white-space tokenizer to create a list of all words.
    -readability_flesch_re: (int) Flesch Reading Ease Score (numeric score) made by textstat module.
    -readability_smog_index: (int) grade level as determined by the SMOG algorithm made by textstat module.
    -readability_flesch_kincaid_grade: (int) Flesch-Kincaid grade level of reader made by textstat module.
    -readability_coleman_liau_index: (int) grade level of reader as made by textstat module.
    -readability_ari: (int) grade level of reader determined by the automated readability index algorithm implemented by textstat.
    -readability_linsear_write: grade level as determined by the Linsear Write algorithm implemented by textstat.
    -readability_dale_chall: (int) grade level based on Dale-Chall readability as determined by textstat.
    -readability_standard: composite grade level based on readability algorithms.
    -flesch_re_key: list for interpreting Flesch RE Score.
    -word_count: word count of document based on white-space tokenizer; this word count should be used.
    -page_length: (float) page length in decimal format given 250 words per page.
    -paper_count: (int) number of printed pages given 250 words per page.
    -parts_of_speech: words with parts of speech tags.
    -pos_counts: values in word, tag couple grouped in a list (Counter).
    -pos_total: (int) sum of pos_counts values.
    -pos_freq: (dict) word, ratio of whole.
    -doc_pages: (float) page length based on 250 words per page (warning, this is the second time this attribute is defined).
    -freq_words: word frequency count, not standardized, based on the correct word tokenizer (not ratio, just count).
    -modal_dist: count of auxiliary verbs based on word_tokens_no_punct.

    sentence_count (int): count of the sentence tokens
    passive_sentences (list): list of all sentences identified as passive
    passive_sentence_count (int): count of items in passive_sentences
    be_verb_count (int): count of "to be" verbs in text
    word_tokens_no_punct (list): words separated, stripped of punctuation, made lower case
    flesch_re_key (str): reading ease score to description
    freq_words (list or dict): frequency distribution of all words
    modal_dist (list): frequency distribution of aux verbs
    """
    self.raw_text = long_string
    self.raw_text = unidecode.unidecode_expect_nonascii(self.raw_text)
    self.user = ""
    self.time_stamp = self.timestamp()
    # normalize curly quotes, dashes, full-width commas, and ellipses to ASCII
    self.ptext = re.sub('[\u201c\u201d]', '"', self.raw_text)
    self.ptext = re.sub("\u2014", "--", self.ptext)
    self.ptext = re.sub("\uff0c", ",", self.ptext)
    self.ptext = re.sub("…", "...", self.ptext)
    self.text_no_feed = self.clean_new_lines(self.ptext)
    self.sentence_tokens = self.sentence_tokenize(self.text_no_feed)
    self.sentence_count = len(self.sentence_tokens)
    self.passive_sentences = passive(self.text_no_feed)
    self.passive_sentence_count = len(self.passive_sentences)
    self.percent_passive = (
        100 * (float(self.passive_sentence_count) / float(self.sentence_count)))
    self.percent_passive_round = round(self.percent_passive, 2)
    self.be_verb_analysis = self.count_be_verbs(self.sentence_tokens)
    self.be_verb_count = self.be_verb_analysis[0]
    self.weak_sentences_all = self.be_verb_analysis[1]
    self.weak_sentences_set = set(self.weak_sentences_all)
    self.weak_sentences_count = len(self.weak_sentences_set)
    self.weak_verbs_to_sentences = 100 * float(
        self.weak_sentences_count) / float(self.sentence_count)
    self.weak_verbs_to_sentences_round = round(
        self.weak_verbs_to_sentences, 2)
    self.word_tokens = self.word_tokenize(self.text_no_feed)
    self.word_tokens_no_punct = \
        self.word_tokenize_no_punct(self.text_no_feed)
    self.no_punct = self.strip_punctuation(self.text_no_feed)
    # use this! It makes the text lower case and strips symbols;
    # overrides the earlier word_tokens_no_punct definition
    self.word_tokens_no_punct = self.ws_tokenize(self.no_punct)
    self.readability_flesch_re = \
        textstat.flesch_reading_ease(self.text_no_feed)
    self.readability_smog_index = \
        textstat.smog_index(self.text_no_feed)
    self.readability_flesch_kincaid_grade = \
        textstat.flesch_kincaid_grade(self.text_no_feed)
    self.readability_coleman_liau_index = \
        textstat.coleman_liau_index(self.text_no_feed)
    self.readability_ari = \
        textstat.automated_readability_index(self.text_no_feed)
    self.readability_linsear_write = \
        textstat.linsear_write_formula(self.text_no_feed)
    self.readability_dale_chall = \
        textstat.dale_chall_readability_score(self.text_no_feed)
    self.readability_standard = \
        textstat.text_standard(self.text_no_feed)
    self.flesch_re_desc_str = self.flesch_re_desc(
        int(textstat.flesch_reading_ease(self.text_no_feed)))
    self.polysyllabcount = textstat.polysyllabcount(self.text_no_feed)
    self.lexicon_count = textstat.lexicon_count(self.text_no_feed)
    self.avg_syllables_per_word = textstat.avg_syllables_per_word(
        self.text_no_feed)
    self.avg_sentence_per_word = textstat.avg_sentence_per_word(
        self.text_no_feed)
    self.avg_sentence_length = textstat.avg_sentence_length(
        self.text_no_feed)
    self.avg_letter_per_word = textstat.avg_letter_per_word(
        self.text_no_feed)
    self.difficult_words = textstat.difficult_words(self.text_no_feed)
    self.rand_passive = self.select_random(self.passive_sentence_count,
                                           self.passive_sentences)
    if self.weak_sentences_set:
        self.rand_weak_sentence = self.select_random(
            self.weak_sentences_count, list(self.weak_sentences_set))
    if self.word_tokens_no_punct:
        self.word_count = len(self.word_tokens_no_punct)
        self.page_length = float(self.word_count) / float(250)
        self.paper_count = int(math.ceil(self.page_length))
        self.parts_of_speech = pos_tag(self.word_tokens_no_punct)
        self.pos_counts = Counter(tag for word, tag in self.parts_of_speech)
        self.pos_total = sum(self.pos_counts.values())
        self.pos_freq = dict(
            (word, float(count) / self.pos_total)
            for word, count in list(self.pos_counts.items()))
        self.doc_pages = float(float(self.word_count) / float(250))
        self.freq_words = \
            self.word_frequency(self.word_tokens_no_punct)
        self.modal_dist = self.modal_count(self.word_tokens_no_punct)
        # self.ws_tokens = self.ws_tokenize(self.text_no_cr)
        self.pos_count_dict = list(self.pos_counts.items())

        # Model - use for any pos
        self.modals = self.pos_isolate('MD', self.pos_count_dict)
        self.preposition_count = self.pos_isolate('IN', self.pos_count_dict)
        self.adjective_count = self.pos_isolate_fuzzy(
            'JJ', self.pos_count_dict)
        self.adverb_count = self.pos_isolate_fuzzy('RB', self.pos_count_dict)
        self.proper_nouns = self.pos_isolate_fuzzy('NNP', self.pos_count_dict)
        self.cc_count = self.pos_isolate('CC', self.pos_count_dict)
        self.commas = self.char_count(",")
        self.comma_sentences = self.list_sentences(",")
        self.comma_example = self.select_random(len(self.comma_sentences),
                                                self.comma_sentences)
        self.semicolons = self.char_count(";")
        if self.semicolons:
            self.semicolon_sentences = self.list_sentences(";")
            self.semicolon_example = self.select_random(
                len(self.semicolon_sentences), self.semicolon_sentences)
        self.lint_suggestions = lint(self.raw_text)
def merge_review_jsons(folder_name):
    path = '../../data/raw/' + folder_name
    # loop over files in the folder, merge the dataframes
    frames = []
    for fname in os.listdir(path):
        full_path = path + '/' + fname
        frames.append(pd.read_json(full_path))
    df = pd.concat(frames)
    # each entry is a dictionary; only length 3 was populated with data from
    # the scraper
    num_els = df['results'].apply(lambda x: len(x))
    df = df[num_els == 3]
    # create empty lists, to be appended to by expanding each dictionary
    names_list = []
    ids_list = []
    reviews_list = []
    # loop over each row, open the dictionary to get all reviews associated
    # with each wine... then create lists of the reviews, corresponding names,
    # and ids. then create df from the lists
    for i in range(df.shape[0]):
        entry = df['results'].iloc[i]
        # decode unicode chars
        name = unidecode_expect_nonascii(entry['wine_name'])
        # split wine id from url
        wine_id = int(entry['wine_name_url'].split('?iWine=')[1])
        # get dictionary of wine reviews for each wine
        reviews = entry['wine_reviews']
        n_reviews = len(reviews)
        # append the names and ids to each list n_reviews times;
        # when we create the dataframe these will then be in the proper rows
        # associated with each review
        names_list += [name] * n_reviews
        ids_list += [wine_id] * n_reviews
        # loop over the reviews, decode the unicode chars, then append to list
        for j in range(n_reviews):
            reviews_list.append(unidecode_expect_nonascii(reviews[j]['name']))
    # create dataframe, rename columns
    unpacked = pd.DataFrame(list(zip(names_list, ids_list, reviews_list)))
    unpacked.columns = ['wine_name', 'id', 'review_text']
    # remove rows with no review text
    not_null = ~unpacked['review_text'].isnull()
    unpacked = unpacked[not_null]
    # drop duplicates
    unpacked.drop_duplicates(inplace=True)
    return unpacked
def DownloadPageHistory(browser, historyRoot, pageName, justUpdate):
    # Open the Fancy 3 page in the browser
    browser.get("http://fancyclopedia.org/" + pageName + "/noredirect/t")

    # Get the first two letters in the page's name
    # These are used to disperse the page directories among many directories so as to
    # avoid having so many subdirectories that Windows Explorer breaks when viewing it
    d1 = pageName[0]
    d2 = d1
    if len(pageName) > 1:
        d2 = pageName[1]

    # Check to see what we have already downloaded.
    # Any history already downloaded will be in historyRoot/d1/d2/pageName+nnnn, where nnnn is the version number
    # Read historyRoot/d1/d2 and make a list of the version number of all directories found
    # The version directories are named Vnnnn
    pagePath = os.path.join(historyRoot, d1, d2, pageName)
    existingVersions = []
    lowestVersionNeeded = 0
    if os.path.exists(pagePath):
        existingVersions = [entry for entry in os.scandir(pagePath)
                            if entry.is_dir()]  # All the subdirectory objects
        existingVersions = [entry.name for entry in existingVersions]  # Make into a list of subdirectory names
        existingVersions = [entry[1:] for entry in existingVersions
                            if entry[0] == 'V' and len(entry) == 5 and entry[1:].isdigit()]
        existingVersions = [int(entry) for entry in existingVersions]  # Convert to number

        # Now figure out what the lowest version still needed is.
        # (Knowing this may allow us to optimize page loads.)
        if len(existingVersions) > 0:
            lowestVersionNeeded = max(existingVersions) + 1
            i = 0
            while i < max(existingVersions) + 1:
                if i not in existingVersions:
                    lowestVersionNeeded = i  # Note that this will be max+1 if there are no gaps in the list
                    break
                i = i + 1
            print("    First version needed: " + str(lowestVersionNeeded))

    # Page found?
    errortext = "The page <em>" + pageName.replace("_", "-") + "</em> you want to access does not exist."
    if errortext in browser.page_source:
        print("*** Page does not exist: " + pageName)
        return

    # Find the history button and press it
    browser.find_element_by_id('history-button').send_keys(Keys.RETURN)
    time.sleep(0.5)  # Just-in-case

    # Wait until the history list has loaded
    try:
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.ID, 'revision-list')))
    except:
        print("***Oops. Exception while waiting for the history list to load in " + pageName + ": Retrying")
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.ID, 'revision-list')))

    # Step over the pages of history lines (if any)
    # The pages are a series of spans creating a series of boxes, each with a number in it.
    # There is a box labeled "current" and we want to click on the *next* box.
    firstTime = True
    terminate = False
    while not terminate:
        # There may be a "pager" -- a series of buttons to show successive pages of history
        try:
            pagerDiv = browser.find_element_by_xpath('//*[@id="revision-list"]/div')
        except SeEx.NoSuchElementException:
            pagerDiv = None
        except:
            print("***Oops. Exception while looking for pager div in " + pageName)
            return

        if pagerDiv is None and not firstTime:
            break

        # If there are multiple pages of history, then before starting the second
        # and subsequent loops, we need to go to the next page
        if pagerDiv is not None and not firstTime:
            # Find the current page indicator
            els = pagerDiv.find_elements_by_tag_name("span")
            # And click the *next*, if any, to go to the next page
            for i in range(0, len(els)):
                if els[i].get_attribute("class") == "current":
                    if i + 1 < len(els):
                        els[i + 1].find_element_by_tag_name("a").send_keys(Keys.RETURN)
                    else:
                        terminate = True
                    break
            if terminate:
                break
        firstTime = False

        # Get the history list
        historyElements, id = ExtractHistoryList(browser)
        # Note that the history list is from newest to oldest, but we don't care
        # since we traverse them all

        # The structure of a line is
        #   The revision number followed by a "."
        #   A series of single letters (these letters label buttons)
        #   The name of the person who updated it
        #   The date
        #   An optional comment
        # This calls for a Regex
        rec = Regex.compile(
            "^"                              # Start at the beginning
            "(\d+)."                         # Look for a number at least one digit long followed by a period and space
            "( [A-UW-Z]| [A-Z] [A-UW-Z]|)?"  # Look for a single capital letter or two separated by spaces, or this could be missing
                                             # We skip the V as the final letter to avoid conflict with the next pattern
            "( V S R | V S )"                # Look for either ' V S ' or ' V S R '
            "(.*)"                           # Look for a name
            "(\d+ [A-Za-z]{3,3} 2\d{3,3})"   # Look for a date in the 2000s of the form 'dd mmm yyyy'
            "(.*)$")                         # Look for an optional comment

        #TODO: Don't load earlier history pages unless we need to.
        i = 0
        # We do this kludge because we need to continually refresh historyElements.
        # While it may become stale, at least it doesn't change size.
        while i < len(historyElements):
            # Regenerate the history list, as it may have become stale
            # This while loop, et al, is to allow retries since sometimes it doesn't seem to load in time
            historyElements = None
            count = 0
            gps = None
            while (historyElements is None or gps is None) and count < 5:
                try:
                    historyElements = browser.find_element_by_xpath(
                        '//*[@id="revision-list"]/table/tbody').find_elements_by_xpath("tr")
                    time.sleep(0.1)
                    # The next two statements are here just to trigger an exception if not loaded fully
                    id = historyElements[i + 1].get_attribute("id").replace("revision-row-", "")
                    t = historyElements[i + 1].text
                    historyElements = historyElements[1:]  # The first row is column headers, so skip them.
                    el = historyElements[i]
                    id = el.get_attribute("id").replace("revision-row-", "")
                    t = el.text
                    # print("t='" + t + "'")
                    m = rec.match(t)
                    gps = m.groups()
                    if gps is None:
                        print("***gps is None")
                    if len(gps) < 5:
                        print("***gps is too short")
                    user = gps[3]
                except Exception as exception:
                    # Wait and try again
                    time.sleep(1)
                    count = count + 1
                    print("... Retrying historyElements(2): " +
                          type(exception).__name__ + "  count=" + str(count))
            if historyElements is None and count >= 5:
                print("***Could not get historyElements(2) after five tries.")
            if gps is None:
                print("***gps is None (2)")

            # Get the revision number. Skip it if it's in the list of existing revisions
            revNum = gps[0]
            if int(revNum) not in existingVersions:
                # The Regex greedy capture of the user name captures the 1st digit of
                # 2-digit dates. This shows up as the user name ending in a space
                # followed by a single digit. Fix this if necessary
                user = gps[3]
                date = gps[4]
                if user[-2:-1] == " " and user[-1:].isdigit():
                    date = user[-1:] + gps[4]
                    user = user[:-2]

                # Click on the view source button for this row
                el.find_elements_by_tag_name("td")[3].find_elements_by_tag_name("a")[1].click()

                # This while loop, et al, is to allow retries since sometimes it doesn't seem to load in time
                divRevList = None
                count = 0
                while divRevList is None and count < 5:
                    try:
                        divRevList = browser.find_element_by_xpath('//*[@id="revision-list"]/table/tbody')
                    except SeEx.NoSuchElementException:
                        # Wait and try again
                        time.sleep(1)
                        count = count + 1
                if divRevList is None and count >= 5:
                    print("***Could not get divRevList after five tries.")

                source = None
                count = 0
                while source is None and count < 5:
                    try:
                        source = divRevList.find_element_by_xpath('//*[@id="history-subarea"]/div').text
                    except (SeEx.NoSuchElementException, SeEx.StaleElementReferenceException):
                        # Wait and try again
                        time.sleep(1)
                        count = count + 1
                if source is None and count >= 5:
                    print("***Could not get source after five tries.")
                del divRevList

                # Write out the xml data
                root = ET.Element("data")
                el = ET.SubElement(root, "number")
                number = str(gps[0])
                el.text = number
                el = ET.SubElement(root, "ID")
                el.text = str(id)
                el = ET.SubElement(root, "type")
                el.text = str(gps[1])
                el = ET.SubElement(root, "name")
                el.text = str(user)
                el = ET.SubElement(root, "date")
                el.text = str(date)
                el = ET.SubElement(root, "comment")
                el.text = str(gps[5])

                # And write the xml out to file <localName>.xml.
                tree = ET.ElementTree(root)

                # OK, we have everything. Start writing it out.
                # Make sure the target directory exists
                seq = ("0000" + number)[-4:]  # Add leading zeroes
                dir = os.path.join(pagePath, "V" + seq)
                pathlib.Path(dir).mkdir(parents=True, exist_ok=True)
                print("    Loaded V" + seq)

                # Write the directory contents
                tree.write(os.path.join(dir, "metadata.xml"))
                with open(os.path.join(dir, "source.txt"), 'a') as file:
                    file.write(unidecode.unidecode_expect_nonascii(source))

            i = i + 1

        # The history pages are loaded recent (highest version number) first.
        # Check to see if subsequent pages could *possibly* have a version we need.
        # If not, end the loop over pages of history lists.
        if int(revNum) < lowestVersionNeeded:
            break

    # Download the files currently attached to this page
    # Find the files button and press it
    #TODO: Avoid downloading files which already have been downloaded.
    elem = browser.find_element_by_id('files-button')
    elem.send_keys(Keys.RETURN)
    time.sleep(0.7)  # Just-in-case

    # Wait until the files list has loaded
    wait = WebDriverWait(browser, 10)
    #wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'page-files')))
    try:
        els = browser.find_element_by_class_name("page-files").find_elements_by_tag_name("tr")
        for i in range(1, len(els)):
            h = els[i].get_attribute("outerHTML")
            url, linktext = Helpers.GetHrefAndTextFromString(h)
            urllib.request.urlretrieve("http://fancyclopedia.org" + url,
                                       os.path.join(pagePath, linktext))
        print("    " + str(len(els) - 1) + " files downloaded.")
    except:
        k = 0

    # Update the donelist
    with open(os.path.join(historyRoot, "donelist.txt"), 'a') as file:
        file.write(pageName + "\n")
    return
def scrape_url():
    #num1 = randint(1, 10)
    #num2 = randint(1, 10)
    if request.method == 'POST':
        # any value in the 'email_add' field aborts with a 404 (spam trap)
        if request.form['email_add']:
            return render_template('404.html')
        #if (request.form['userSolution'] != request.form['sum']):
        #    return redirect(request.url)
        result = request.form
        if 'plaintext' not in result:
            return redirect(request.url)
        #if 'author' not in result:
        #    author = ""
        url = result['plaintext']
        target = urllib.request.Request(url)
        target.add_header('Accept-Encoding', 'utf-8')
        target.add_header(
            'User-Agent',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/51.0.2704.103 Safari/537.36')
        response = urllib.request.urlopen(target)
        #soup = BS(response.read().decode('utf-8'), convertEntities=BS.HTML_ENTITIES)
        #soup = BS(response.read().decode('utf-8', 'ignore'), convertEntities=BS.HTML_ENTITIES)
        soup = BeautifulSoup(response, 'html.parser')
        #except urllib2.HTTPError, e:
        #    print('We failed with error code %s' % e.code)
        #    if e.code == 404:
        #        render_template('404.html')
        #    elif e.code == 403:
        #        render_template('403.html')
        #    else:
        #        pass
        #paragraphs = ""
        #for s in soup.findAll('br'):
        #    paragraphs += s.get_text(separator=" ", strip=True)
        paragraphs = soup.findAll('p')
        #title = soup.find('title')
        title = soup.title.string
        #h = HTMLParser()
        #paragraphs = h.unescape(paragraphs)
        #title = title.getText()
        #title = h.unescape(title)
        plaintext = ""
        for p in paragraphs:
            plaintext += p.text + "\n\n"
            #plaintext += p.getText() + '\n\n'
            #plaintext += p.getText(" ") + '\n\n'
        prose = unidecode_expect_nonascii(plaintext)
        #prose = paragraphs
        apis = result['apis']
        author = result['author']
        administrator = result['administrator']
        admin_notes = result['notes']
        global Doc
        Doc = read_document.Sample(prose, author, apis)
        Doc.administrator = administrator
        Doc.admin_notes = admin_notes
        if title:
            Doc.title = title
        else:
            Doc.title = url
        if Doc:
            return redirect(url_for('feedback', timestamp=Doc.title))
        else:
            return render_template('url.html')
    else:
        return render_template('url.html')
def __call__(self, text):
    try:
        from unidecode import unidecode_expect_nonascii
    except ImportError:
        raise ImportError('Please install package `unidecode`')
    return unidecode_expect_nonascii(text)
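
# Hedged usage sketch: the host class of the __call__ above is not shown, so a
# hypothetical stand-in is defined here to show how such a callable text
# transform is used, e.g. in a preprocessing pipeline.
class AsciiFold:
    def __call__(self, text):
        try:
            from unidecode import unidecode_expect_nonascii
        except ImportError:
            raise ImportError('Please install package `unidecode`')
        return unidecode_expect_nonascii(text)

fold = AsciiFold()
print(fold('Łódź'))  # -> 'Lodz'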
def run(self):
    global allfiles
    global thread_count
    global pathq
    empty = False
    filesize = 0
    path, size, filetype, modify_time, create_time, access_time, scan_time, uid, gid = [], [], [], [], [], [], [], [], []
    while True:
        filelist = []
        try:
            scanpath = pathq.get()
            logging.info('start at: ' + start + "\n")
            logging.info(scanpath + "\n")
            if os.path.isdir(scanpath):
                filelist = os.listdir(scanpath)
            else:
                print(scanpath + ' is not a valid path')
                continue
            if len(filelist) > 100000:
                print(scanpath + ' has: ' + str(len(filelist)) + ' files' +
                      ', do not scan now')
                continue
            for file in filelist:
                filename = os.path.join(scanpath, file)
                if os.path.exists(filename):
                    if os.path.isdir(filename):
                        pathq.put(filename)
                    elif os.path.isfile(filename):
                        lock.acquire()
                        allfiles += 1
                        print(allfiles)
                        lock.release()
                        fileinfo = os.stat(filename)
                        size.append(fileinfo.st_size)
                        # os.stat_result has no st_type; the file type bits live in st_mode
                        filetype.append(fileinfo.st_mode)
                        modify_time.append(self.timetos(fileinfo.st_mtime))
                        create_time.append(self.timetos(fileinfo.st_ctime))
                        access_time.append(self.timetoint(fileinfo.st_atime))
                        scan_time.append(datetime.datetime.now())
                        uid.append(fileinfo.st_uid)
                        gid.append(fileinfo.st_gid)
                        filename = unidecode.unidecode_expect_nonascii(filename)
                        path.append(filename)
                        filesize += 1
                        # flush a batch to InfluxDB every 50000 files
                        if filesize == 50000:
                            print("Write InfluxDB...")
                            df = pd.DataFrame(
                                {"path": path, "size": size, "filetype": filetype,
                                 "modify_time": modify_time, "create_time": create_time,
                                 "access_time": access_time, "scan_time": scan_time,
                                 "uid": uid, "gid": gid},
                                index=pd.date_range(
                                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                                    freq='U', periods=len(path)))
                            client.write_points(df, 'infos', protocol='json')
                            print("Write Done")
                            logging.info('start at: ' + start)
                            logging.info('end at: ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
                            logging.info("filesize: %s" % allfiles)
                            path, size, filetype, modify_time, create_time, access_time, scan_time, uid, gid = [], [], [], [], [], [], [], [], []
                            filesize = 0
        except Queue.Empty as e:
            # queue drained: flush whatever is left, then stop this worker
            empty = True
            print("Write InfluxDB...")
            df = pd.DataFrame(
                {"path": path, "size": size, "filetype": filetype,
                 "modify_time": modify_time, "create_time": create_time,
                 "access_time": access_time, "scan_time": scan_time,
                 "uid": uid, "gid": gid},
                index=pd.date_range(
                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                    freq='U', periods=len(path)))
            client.write_points(df, 'infos', protocol='json')
            print("Write Done")
            logging.info('start at: ' + start)
            logging.info('end at: ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
            logging.info("filesize: %s" % allfiles)
            path, size, filetype, modify_time, create_time, access_time, scan_time, uid, gid = [], [], [], [], [], [], [], [], []
            print('scanning end, now you can enter ctrl+c to stop this program')
            break
        except OSError as e:
            print('OSError:', e)
            continue
        except Exception as e:
            print('Exception:', e)
            continue
        finally:
            if not empty:
                pathq.task_done()
#val = int(val)
val = 1
host = 'localhost'
port = 27017
print('We are in dbConnection.py')
try:
    connection = MongoClient('localhost', 27017)
    db = connection.core
    collection = db.rss_feed_entry
    for x in collection.find({}, {"_id": 0, "title": 1, "newsCategory": 1}):
        if (x['newsCategory'] == "Emerging Threats and Cyberattacks"
                or x['newsCategory'] == "Cyber Hacks and Incident"
                or x['newsCategory'] == "Threat Actors and Tools"):
            print(x['newsCategory'])
            dbtext = dbtext + unidecode.unidecode_expect_nonascii(x['title'])
    dbtext1 = sent_tokenize(dbtext)
    # dbtext1 = list(dbtext.split(" "))
    # print('dbtext->', dbtext1)
    print("Connected successfully!!!")
except Exception as e:
    print(e)


# Preprocessing
def remove_string_special_characters(s):
    # remove special characters
    stripped = re.sub(r'[^a-zA-Z\s]', '', s)
    stripped = re.sub(r'_', '', stripped)
    # Change any white space to one space
    stripped = re.sub(r'\s+', ' ', stripped)