Example No. 1
def create_corpus_from_web(url_file, raw=False):
    raw_sp2txt = {}
    proc_sp2txt = {}
    speech_titles = {}
    U = open(url_file)
    url_list = [url.strip() for url in U.readlines()]
    for doc_index, url in enumerate(url_list):
        pprint.pprint('Grabbing URL: ' + str(url))

        article = grab_link(url)
        if not (article and article.cleaned_text and article.title):
            pprint.pprint('Skipping. No content in URL: ' + url)
            continue

        title = unidecode.unidecode_expect_nonascii(article.title)

        speech_titles[doc_index] = title

        _raw_input = article.cleaned_text
        text = unidecode.unidecode_expect_nonascii(_raw_input)
        re.sub(r"[\W\d]", " ", text.lower().strip())  # NOTE: return value is discarded, so this call has no effect
        lowers = text.replace('\n',' ').replace('\r',' ')
        while "  " in lowers:
            lowers = lowers.replace('  ',' ')


        ''' store raw text -- for sentence extraction '''
        raw_sp2txt[doc_index] = lowers

        ''' store no_punctuation for NMF '''
        no_punctuation = lowers.translate(None, string.punctuation)
        proc_sp2txt[doc_index] = no_punctuation

    U.close()
    return proc_sp2txt, raw_sp2txt, speech_titles
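Note that lowers.translate(None, string.punctuation) a few lines above is a Python 2 idiom and fails on Python 3 strings. A minimal Python 3 sketch of the same punctuation strip, as a hypothetical standalone helper (not part of the original corpus code):

import string

def strip_punctuation(text):
    # Map every ASCII punctuation character to None and drop it
    # (Python 3 equivalent of text.translate(None, string.punctuation)).
    return text.translate(str.maketrans('', '', string.punctuation))

# strip_punctuation("hello, world!")  ->  "hello world"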
Example No. 2
def create_corpus_from_html(raw_html_path, raw=False):
    raw_sp2txt = {}
    proc_sp2txt = {}
    speech_titles = {}
    for subdir, dirs, files in os.walk(raw_html_path):
        for doc_index, each_file in enumerate(files):
            file_path = subdir + os.path.sep + each_file
            htmlfile = open(file_path, 'r')
            raw_content = htmlfile.read()
            article = Goose().extract(raw_html=raw_content)
            if not (article and article.cleaned_text and article.title):
                continue

            #print 'Processing article: ', article.title
            speech_titles[doc_index] = unidecode.unidecode_expect_nonascii(article.title)
            text = unidecode.unidecode_expect_nonascii(article.cleaned_text)

            re.sub(r"[\W\d]", " ", text.lower().strip())  # NOTE: return value is discarded, so this call has no effect
            lowers = text.replace('\n',' ').replace('\r',' ')
            while "  " in lowers:
                lowers = lowers.replace('  ',' ')

            ''' store raw text -- for sentence extraction '''
            raw_sp2txt[doc_index] = lowers

            ''' store no_punctuation for NMF '''
            no_punctuation = lowers.translate(None, string.punctuation)
            proc_sp2txt[doc_index] = no_punctuation

            htmlfile.close()

    return proc_sp2txt, raw_sp2txt, speech_titles
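The newline replacement plus the while "  " in lowers loop used in both corpus builders above can be collapsed into a single regular expression. A sketch (hypothetical helper, same effect up to leading/trailing spaces):

import re

def normalize_whitespace(text):
    # Turn newlines, carriage returns, tabs, and runs of spaces into single spaces.
    return re.sub(r'\s+', ' ', text).strip()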
Example No. 3
def paste():
    #num1 = randint(1, 10)
    #num2 = randint(1, 10)
    if request.method == 'POST':
        if request.form['email_add']:
            return render_template('404.html')
        #if (request.form['userSolution'] != request.form['sum']):
        #    return redirect(request.url)
        result = request.form
        if 'plaintext' not in result:
            flash('No text submitted')  # does not work yet
            return redirect(request.url)
        if 'author' not in result:
            return redirect(request.url)
        prose = unidecode_expect_nonascii(result['plaintext'])
        apis = result['apis']
        author = result['author']
        administrator = result['administrator']
        admin_notes = result['notes']
        global Doc

        Doc = read_document.Sample(prose, author, apis)
        Doc.administrator = administrator
        Doc.admin_notes = admin_notes
        Doc.timestamp = datetime.now()
        return redirect(url_for('feedback', timestamp=Doc.timestamp))
    return render_template('paste.html')
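For context, the transliteration applied to the pasted text behaves roughly as follows (illustrative input, not taken from the app itself):

from unidecode import unidecode_expect_nonascii

sample = '“Naïve” café prose — with smart punctuation…'
print(unidecode_expect_nonascii(sample))
# "Naive" cafe prose -- with smart punctuation...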
Example No. 4
def parse_pdf_using_slate(filepath):
    """
    Parses the PDF file and returns its text in JSON format.
    :input: filepath: path of the PDF file you want to parse.
    :output: returns, and writes to a JSON file, the questions extracted from the PDF.
    """
    all_questions = []
    with open(filepath) as f:
        document = slate.PDF(f)

    count = 0
    for each_page in document:
        questions = re.split(r"\d+[.]", unidecode.unidecode_expect_nonascii(each_page))
        for each in questions:
            # print each
            splited_question = each.split("(A")
            count = count + 1
            try:
                question_dict = {}
                if splited_question[0] != "":
                    question_dict["question_statement"] = splited_question[0]
                    question_dict["answer_options"] = "(A" + splited_question[1]
                    all_questions.append(question_dict)
            except IndexError:
                pass  # questions without an "(A" options block (e.g. the clock 9.50 pattern) are skipped
    with open("data/parsed_questions.json", "w") as q_file:
        json.dump(all_questions, q_file, indent=4)
    return all_questions
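To illustrate the question splitting above, here is what the number pattern does to a made-up page string:

import re

page = "1. First question (A) x (B) y 2. Second question (A) p (B) q"
print(re.split(r"\d+[.]", page))
# ['', ' First question (A) x (B) y ', ' Second question (A) p (B) q']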
Example No. 5
 async def on_member_update(before, after):
     g = after.guild
     isascii = lambda s: len(s) == len(s.encode())
     if after.display_name.startswith(
             tuple(chars)):  # BEGIN AUTO DEHOIST MEME
         exists = (lambda: list(
             r.table('settings').filter(lambda a: a['guild'] == str(
                 g.id)).run(self.conn)) != [])()
         if not exists:
             return
         settings = list(
             r.table('settings').filter(
                 lambda a: a['guild'] == str(g.id)).run(self.conn))[0]
         if 'auto_dehoist' in settings.keys():
             if settings['auto_dehoist']:
                 try:
                     await after.edit(
                         nick=
                         f'{dehoist_char}{after.display_name[0:31]}',
                         reason='[Automatic dehoist]')
                 except discord.Forbidden:
                     return
     if isascii(
             after.display_name
     ) == False and not after.display_name.startswith(dehoist_char):
         exists = (lambda: list(
             r.table('settings').filter(lambda a: a['guild'] == str(
                 g.id)).run(self.conn)) != [])()
         if not exists:
             return
         settings = list(
             r.table('settings').filter(
                 lambda a: a['guild'] == str(g.id)).run(self.conn))[0]
         if 'auto_decancer' in settings.keys():
             if settings['auto_decancer']:
                 aaa = unidecode.unidecode_expect_nonascii(
                     after.display_name)
                 if len(aaa) > 32:
                     aaa = aaa[0:32 - 3] + '...'
                 try:
                     await after.edit(nick=aaa,
                                      reason='[Automatic decancer]')
                 except discord.Forbidden:
                     return
     if before.roles == after.roles:
         return
     if len(before.roles) < len(after.roles):
         return
     # they had a role removed from them
     if after.roles == [after.guild.default_role]:
         # no roles; should be after a manual untoss
         try:
             if self.rolebans[after.id][after.guild.id] in [None, []]:
                 return  # they weren't rolebanned
             await after.edit(
                 roles=self.rolebans[after.id][after.guild.id],
                 reason='[Manual role restore]')
             self.rolebans[after.id][after.guild.id] = None
         except (KeyError, discord.Forbidden):
             return
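Side note: the isascii lambda defined at the top of this handler has a built-in equivalent on Python 3.7+; a sketch:

def is_ascii(s: str) -> bool:
    # str.isascii() exists since Python 3.7; the encode-based lambda is only needed on older versions.
    return s.isascii()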
 def _normalize_string(self, string):
     ret_string = ''
     for char in string:
         if re.match(u'[Α-Ωα-ωÅ]', char) is not None:
             ret_string += str(char)
         else:
             ret_string += str(unidecode_expect_nonascii(str(char)))
     return ret_string
    def _normalize_string(self, string):
        ret_string = ''
        for char in string:
            if re.match('[Α-Ωα-ωÅ]', char) is not None:
                ret_string += char
            else:
                ret_string += unidecode_expect_nonascii(char)

        return ' '.join(ret_string.split())
Example No. 8
    async def decancer(self, ctx, member: discord.Member):
        '"Decancer" a member, or strip all the non-ASCII characters from their name. Useful to make your chat look good.'
        if ctx.author.permissions_in(ctx.channel).manage_nicknames:
            cancer = member.display_name
            decancer = unidecode.unidecode_expect_nonascii(cancer)
            # decancer = re.sub(r'\D\W', '', decancer)
            if len(decancer) > 32:
                decancer = decancer[0:32 - 3] + "..."

            await member.edit(nick=decancer)
            await ctx.send(
                f'Successfully decancered {cancer} to ​`{decancer}​`.')

        else:
            cancer = member.display_name
            decancer = unidecode.unidecode_expect_nonascii(cancer)
            await ctx.send(
                f'The decancered version of {cancer} is ​`{decancer}​`.')
Example No. 9
def convert_text(text):
    """
    Function that converts text to unicode and checks if there are empty strings
    INPUT: text from the consumer complaint narratives
    OUTPUT: original text plus tag for empty strings
    """
    if (text == ''):
        print "FOUND MISSING TEXT"
        return "--MISSING INFO--"
    else:
        return unidecode.unidecode_expect_nonascii(text)
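Illustrative calls (expected unidecode outputs; not taken from the original project):

# convert_text('')           ->  '--MISSING INFO--'  (after printing the warning)
# convert_text(u'también')   ->  'tambien'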
Example No. 10
def scan(app, threadnums, whitepaths, blackpaths, outputpath):
    global allfiles
    global pathq
    pathq = whitepaths
    filelist = []
    try:
        scanpath = pathq.get()
        if scanpath not in blackpaths and os.path.isdir(scanpath):
            filelist = os.listdir(scanpath)
            if len(filelist) < 100000:
                flist = []
                for file in filelist:
                    filename = os.path.join(scanpath, file)
                    if os.path.exists(filename):
                        if os.path.isdir(filename):
                            pathq.put(filename)
                            if threading.activeCount() < threadnums:
                                t = threading.Thread(target=scan)
                                t.start()
                        elif os.path.isfile(filename):
                            lock.acquire()
                            allfiles += 1
                            print(app + ':%s' % allfiles)
                            lock.release()
                            fileinfo = os.stat(filename)
                            f = scanfile(
                                fileinfo.st_type, fileinfo.st_size,
                                fileinfo.st_mtime,
                                fileinfo.st_atime, fileinfo.st_ctime,
                                datetime.datetime.now(), fileinfo.st_uid,
                                fileinfo.st_gid,
                                unidecode.unidecode_expect_nonascii(filename))
                            flist.append(f)
                dir = scandir(scanpath, flist)
                with open(outputpath, 'w') as fw:
                    fw.write(json.dumps(dir, default=lambda obj: obj.__dict__))
                flist = []
            else:
                print(scanpath + ' has: ' + str(len(filelist)) + ' files' +
                      ', do not scan now')
        else:
            print(scanpath + ' is not a valid path')

    except OSError as e:
        print('OSError: ' + str(e))
        logging.info('error at: ' +
                     datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
                     ' in ' + scanpath + ' OSError: ' + str(e))
    except Exception as e:
        print('Exception: ' + str(e))
        logging.info('error at: ' +
                     datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
                     ' in ' + scanpath + ' Exception: ' + str(e))
Example No. 11
def unicodeToAscii(s):
    #s = s.decode('utf-8')    # from byte string to utf-8
    if type(s) == str:
        s = unicode(s, 'utf-8')    # from byte string to utf-8 (Python 2)
    s = unidecode_expect_nonascii(s)    # transliterate "odd" utf-8 characters into an ASCII string
    s = unicode(s)    # back to unicode

    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )
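The helper above relies on the Python 2 unicode() builtin. A rough Python 3 sketch of the same routine, passing the all_letters whitelist explicitly instead of reading a global (assumed to be the same string of allowed characters as in the original):

import unicodedata
from unidecode import unidecode_expect_nonascii

def unicode_to_ascii_py3(s, all_letters):
    # Transliterate to ASCII first, then drop any remaining combining marks (category 'Mn')
    # and keep only characters present in the all_letters whitelist, as the original does.
    s = unidecode_expect_nonascii(s)
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn' and c in all_letters
    )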
Example No. 12
def merge_csvs():
    # columns to keep from scraped data
    cols = [
        'results_wine_name', 'results_wine_name_url',
        'results_wine_reviews_name'
    ]

    # ten files initially scraped, iterate over them and merge
    for i in range(10):
        tmp = pd.read_csv('../../data/raw/first_1000/run_results' + str(i) +
                          '.csv')
        tmp = tmp[cols]

        if 'df' not in vars():
            df = tmp
        else:
            df = pd.concat([df, tmp])

    # rename columns
    df.columns = ['wine_name', 'id', 'review_text']

    # remove null values
    not_null = ~df['review_text'].isnull()

    df = df[not_null]

    # drop duplicates (if created by scraper)
    df.drop_duplicates(inplace=True)

    # decode unicode characters
    df['wine_name'] = df['wine_name'].apply(
        lambda x: unidecode_expect_nonascii(x))
    df['review_text'] = df['review_text'].apply(
        lambda x: unidecode_expect_nonascii(x))

    # split the wine id from the url
    df['id'] = df['id'].apply(lambda x: int(x.split('?iWine=')[1]))

    return df
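A small aside: the 'df' not in vars() pattern above can be avoided by collecting the frames in a list and concatenating once. A sketch using the same file paths as above:

import pandas as pd

cols = ['results_wine_name', 'results_wine_name_url', 'results_wine_reviews_name']
frames = [
    pd.read_csv('../../data/raw/first_1000/run_results{}.csv'.format(i))[cols]
    for i in range(10)
]
df = pd.concat(frames, ignore_index=True)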
Example No. 13
def filter_tokenized_file(filename, vocab, min_len=5):
    sents = []
    filters = {
        'copyright', 'chapter', 'edition', 'license', 'licensed', 'published'
    }
    with smart_open.open(os.path.join(BOOKCORPUS_PROCESSED_DIR, filename),
                         encoding='utf-8') as f:
        for line in f:
            sent = line.replace('\n', '')
            sent = sent.replace(u'\u2026', '')
            sent = unidecode.unidecode_expect_nonascii(sent)
            sent = sent.replace('``', '"')
            sent = sent.replace('`', '\'')
            sent = sent.replace('\'\'', '"')

            words = sent.split()

            if len(words) < min_len:
                continue

            num_punk = 0
            num_known = 0
            filter_flag = False

            for word in words:
                if word.lower() in filters:
                    filter_flag = True
                    break

                if not word.isalnum():
                    num_punk += 1

                elif word in vocab:
                    num_known += 1

            if filter_flag:
                continue

            thresh = len(words) * 0.5
            if num_punk >= thresh or num_known < 1:
                continue

            sents.append(sent)

    if len(sents) > 0:
        with smart_open.open(os.path.join(BOOKCORPUS_AUTHOR_DIR, filename),
                             'w',
                             encoding='utf-8') as f:
            for sent in sents:
                f.write(sent + '\n')  # sent is already a text string under Python 3; no decode needed
Example No. 14
def get_unique_tweets():
    conn = sqlite3.connect('twitter_project.db')
    c = conn.cursor()
    c.execute("select * from tweets")
    tweet_dict = {}
    for each in c.fetchall():
        user = unidecode.unidecode_expect_nonascii(each[2][0:15])

        if user[0:4] == "RT @":
            user = user[4:]
        if user in tweet_dict:
            tweet_dict[user] +=1
        else:
            tweet_dict[user] = 1
    for each in tweet_dict.keys():
        print str(each)+": "+str(tweet_dict[each])

    print "total unique tweets: "+str(len(tweet_dict.keys()))
#get_unique_tweets()
#get_users_in_database()
Example No. 15
def get_unique_tweets():
    conn = sqlite3.connect('twitter_project.db')
    c = conn.cursor()
    c.execute("select * from tweets")
    tweet_dict = {}
    for each in c.fetchall():
        user = unidecode.unidecode_expect_nonascii(each[2][0:15])

        if user[0:4] == "RT @":
            user = user[4:]
        if user in tweet_dict:
            tweet_dict[user] += 1
        else:
            tweet_dict[user] = 1
    for each in tweet_dict.keys():
        print str(each) + ": " + str(tweet_dict[each])

    print "total unique tweets: " + str(len(tweet_dict.keys()))


#get_unique_tweets()
#get_users_in_database()
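The manual counting dict in the two examples above could also be built with collections.Counter. A sketch, assuming (as the slicing above suggests) that column 2 of the tweets table holds the tweet text:

from collections import Counter

import unidecode

def count_unique_users(rows):
    # rows: the tuples returned by cursor.fetchall()
    users = []
    for row in rows:
        user = unidecode.unidecode_expect_nonascii(row[2][0:15])
        if user.startswith("RT @"):
            user = user[4:]
        users.append(user)
    return Counter(users)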
Example No. 16
def parse_speeches(corpus_path):
    raw_sp2txt = {}
    proc_sp2txt = {}
    for subdir, dirs, files in os.walk(corpus_path):
        for each_file in files:
            #pprint.pprint("-- processing: {}".format(each_file))
            file_path = subdir + os.path.sep + each_file
            fhandle = open(file_path, 'r')
            _raw_input = fhandle.read()
            text = unidecode.unidecode_expect_nonascii(_raw_input)
            re.sub(r"[\W\d]", " ", text.lower().strip())  # NOTE: return value is discarded, so this call has no effect
            lowers = text.replace('\n',' ').replace('\r',' ')
            while "  " in lowers:
                lowers = lowers.replace('  ',' ')

            ''' store raw text -- for sentence extraction '''
            raw_sp2txt[each_file] = lowers

            ''' store no_punctuation for NMF '''
            no_punctuation = lowers.translate(None, string.punctuation)
            proc_sp2txt[each_file] = no_punctuation

    return proc_sp2txt, raw_sp2txt
Example No. 17
def tokenize_file(filename):
    raw_filename = os.path.join(BOOKCORPUS_RAW_DIR, filename)
    tokenized_sents = []
    with smart_open.open(raw_filename, encoding='utf-8') as f:
        lines = f.readlines()
        if lines[0].strip() == '<!DOCTYPE html>':
            return

        for line in lines:
            sents = sent_tokenize(line)
            for sent in sents:
                if sent.strip():
                    if 'http' in sent:
                        continue

                    sent = sent.strip()
                    sent = sent.replace(u'\u2026', '')
                    sent = unidecode.unidecode_expect_nonascii(sent)
                    sent = sent.replace('_', '')
                    words = " ".join(word_tokenize(sent))
                    words = words.replace('``', '"')
                    words = words.replace('`', '\'')
                    words = words.replace('\'\'', '"')

                    if validate_words(words):
                        tokenized_sents.append(words)

    if len(tokenized_sents) <= 1:
        return

    with smart_open.open(os.path.join(BOOKCORPUS_PROCESSED_DIR, filename),
                         'w',
                         encoding='utf-8') as f:
        for line in tokenized_sents:
            if line[-1] != '\n':
                line += '\n'
            f.write(line)  # line is already a text string under Python 3; no decode needed
Example No. 18
def remove_accents(place: str) -> str:
    return unidecode_expect_nonascii(place)
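Illustrative calls (the place names are arbitrary; the outputs are the standard unidecode transliterations):

assert remove_accents('São Paulo') == 'Sao Paulo'
assert remove_accents('Kraków') == 'Krakow'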
 
Example No. 19
 stage = all_url_stages[ci]
 state = all_url_states[ci]
     
 ## ajax/json table
 if "'ajax': jsonURL" in content:
     
     ## Construct URL
     jsonfn = content.split("jsonURL = '")[1].split("';")[0].split("?\'")[0]
     jsonURL = '/'.join(all_urls[ci].split('/')[:-2]) + '/' + jsonfn
     
     ## Request content
     json_raw = opener.open(six.moves.urllib.request.Request(jsonURL)).read()
     
     ## Decode content
     jsoncontent = unidecode_expect_nonascii(json_raw.decode('utf-8'))
     
     ## Convert to dataframe
     pdf = pd.read_json(jsoncontent, orient='split')
     
     ## Clean up dataframe content
     city_col_name = pdf.columns[pdf.columns.str.startswith('City / Town')].values[0]
     pdf['City/Town'] = pdf[city_col_name].apply(lambda x: x.split(' (')[0].strip())
     pdf['Facility_name_clean'] = pdf['Facility Name'].apply(lambda x: BeautifulSoup(x).text.split(' (PDF')[0].split('\n')[0].split("in new window.'>")[-1])
     pdf['Permit_URL'] = pdf['Facility Name'].apply(lambda x: [
             BeautifulSoup(x).findAll('a')[j].get('href') 
             for j in range(len(BeautifulSoup(x).findAll('a')))
             ])
     pdf['Stage'] = stage
     pdf['State'] = state
     pdf['Watershed'] = pdf[city_col_name].apply(lambda x: x.split('(')[1][:-1].strip() if '(' in x else np.nan)
Example No. 20
						if table1[server][params[0]][params[5]][params[6]].has_key(time_hr) == False:
							table1[server][params[0]][params[5]][params[6]][time_hr] = {}
							table1[server][params[0]][params[5]][params[6]][time_hr]['server_type'] = server_name 
							table1[server][params[0]][params[5]][params[6]][time_hr]['count'] = int(pair[1])
							table1[server][params[0]][params[5]][params[6]][time_hr]['qps'] = int(pair[1])/3600.0


	exchange_table = {}
	for id,name in exchange_data:
	  	exchange_table[str(id)] = name
	exchange_table['0'] = 'Unknown'

	country_table = {}
	for id,name in country_data:
		# name = unicode(name, errors='replace')
		name = unidecode.unidecode_expect_nonascii(name)
		# name = name.decode("utf-8", "replace")
	  	country_table[str(id)] = name
	country_table['0'] = 'Unknown'

	getDictinory(response_table_dca, 'dca')
	getDictinory(response_table_hkg, 'hkg')

	get_logger.info('inserting data into influxdb')
	points = []
	now = datetime.datetime.today()
	for i in table1:
		for j in table1[i]:
			if j == 'REC':		
				for k in table1[i][j]:
					for l in table1[i][j][k]:
Example No. 21
 def removeNonAscii(val):
     return unidecode_expect_nonascii(val).replace(",ai", "-")
Example No. 22
def jot2pondera():

    #os.rename('survey.csv', 'survey.csv.bak')
    #with open('survey.csv.bak', 'rU') as infile, open('survey.csv', 'w') as outfile:
    #    for line in infile:
    #        outfile.write(line.replace('\r\n',''))
    #os.remove('survey.csv.bak')

    columns = {u'Submission ID': 'id',
               u'Submission Date': 'timestamp',
               u'¿Dónde estás?': 'geo',
               u'Tipo de Reporte': 'tipo',
               u'¿Qué actividades realizará el voluntario?': 'actividad',
               u'Describe brevemente el estado de tu inmueble': 'inmueble',
               u'¿Qué necesitas acopiar?': 'acopio',
               u'¿Qué necesita el hospital?': 'hospital',
               u'Escribe brevemente qué necesitas': 'necesita',
               u'¿Cuántos Voluntarios Necesitas?': 'voluntarios',
               u'Nombre del albergue': 'nombre_albergue',
               u'¿Qué ofrece el albergue?': 'albergue',
               u'Sólo si lo necesitas, escribe un breve comentario': 'comentario',
               }
    dtypes = {u'Submission ID': int,
              u'¿Dónde estás?': str,
              u'Tipo de Reporte': str,
              u'¿Qué actividades realizará el voluntario?': str,
              u'Describe brevemente el estado de tu inmueble': str,
              u'¿Qué necesitas acopiar?': str,
              u'¿Qué necesita el hospital?': str,
              u'Escribe brevemente qué necesitas': str,
              u'¿Cuántos Voluntarios Necesitas?': str,
              u'Nombre del albergue': str,
              u'¿Qué ofrece el albergue?': str,
              u'Sólo si lo necesitas, escribe un breve comentario': str,
              }

    url = 'https://www.jotform.com/csv/72647940607059'
    if os.path.exists('survey.csv'):
        os.rename('survey.csv','survey.bk')
    reporte = wget.download(url,'survey.csv')
    os.rename('survey.csv', 'survey.csv.bak')
    with open('survey.csv.bak', 'rU') as infile, open('survey.csv', 'w') as outfile:
       for line in infile:
           outfile.write(line.replace('\r\n',''))
    os.remove('survey.csv.bak')

    df = pd.read_csv(reporte, encoding='utf-8', parse_dates=['Submission Date'], dtype=dtypes,
                     na_values=[''])
    df.rename(columns=columns, inplace=True)
    df = df.replace(np.nan, ' ')          # assign the result; DataFrame.replace is not in-place by default
    df = df.replace('\n', '', regex=True)
    cols = ['actividad', 'inmueble', 'acopio', 'hospital', 'necesita',
            'voluntarios', 'albergue', 'comentario']  # , 'nombre_albergue'
    df = df.where((pd.notnull(df)), '')
    for i, row in df.iterrows():
        for c in cols:
            if not isinstance(df.loc[i, c], float):
                df.loc[i, c] = ud.unidecode_expect_nonascii(df.loc[i, c])
    df.loc[:, 'lon'] = df.geo.str.extract('(\d+\.\d+).*(-\d+\.\d+)')[1]
    df.loc[:, 'lat'] = df.geo.str.extract('(\d+\.\d+).*(-\d+\.\d+)')[0]
    df.loc[:, 'store_point'] = 'POINT (' + df.lon + ' ' + df.lat + ')'
    df.loc[:, 'suc'] = ''

    for i, d in df.iterrows():
        s = str(d['timestamp']) + ' '
        if str(d['tipo'].encode('utf-8')) == 'Acopio o Solicitud in situ':
            s += 'Se necesita: ' + ' '.join([str(d['acopio'].encode('utf-8')), str(d['necesita'].encode('utf-8'))])
        elif str(d['tipo'].encode('utf-8')) == 'Acopio Hospital':
            s += 'Se necesita: ' + ' '.join([str(d['hospital']), str(d['necesita'])])
        elif str(d['tipo'].encode('utf-8')) == 'Requiero Voluntarios':
            s += 'Se necesitan ' + str(d['voluntarios']) + \
                ' voluntarios para realizar: ' + str(d['necesita'])
        elif str(d['tipo'].encode('utf-8')) == 'Dar de Alta Albergue':
            s += str(d['nombre_albergue']) + ' - características: ' + str(d['albergue'])
        elif str(d['tipo'].encode('utf-8')) == 'Dar de Alta Derrumbe':
            s += '<br>Derrumbe'
        elif str(d['tipo'].encode('utf-8')) == 'Dar de Alta Daños':
            s += '<br>Daño'
        elif str(d['tipo'].encode('utf-8')) == 'Requiero de Revisión en mi Inmueble':
            s += 'Descripción de daños: ' + str(d['inmueble'])
        s = s.replace('nan', '')
        df.loc[i, 'suc'] = str(df.loc[i, 'suc']) + str(s) + ' ' + str(df.loc[i,'comentario'])
        #print df.loc[i,'suc']
    df.to_csv('db_jot.csv', encoding='utf-8')
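For reference, the latitude/longitude extraction above works like this on a made-up geo value:

import re

geo = '19.4326 -99.1332'  # hypothetical value of the geo ('¿Dónde estás?') column: latitude, then longitude
match = re.search(r'(\d+\.\d+).*(-\d+\.\d+)', geo)
print(match.group(1), match.group(2))  # 19.4326 -99.1332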
Example No. 23
    def __init__(self, long_string):
        """
        Create document instance for analysis.

        Opens and reads document to string raw_text.
        Textract interprets the document format and
        opens to plain text string (docx, pdf, odt, txt)

        Args:
            long_string (str): string to analyze.


        Public attributes:
        -user: (str) optional string to set username.
        -path: (str) relative path to document.
        -abs_path: (str) the absolute path to the document.
        -file_name:  (str) the file name with extension of document (base
        name).
        -mime:  tbd
        -guessed_type:  makes best guess of mimetype of document.
        -file_type:  returns index[0] from guessed_type.
        -raw_text:  (str) plain text extracted from .txt, .odt, .pdf, .docx,
        and .doc.
        -ptext:  (str) raw text after a series of regex expressions to
        eliminate special characters.
        -text_no_feed:  (str) ptext with most new line characters eliminated
        /n/n stays intact.
        -sentence_tokens:  list of all sentences in a comma separated list
        derived by nltk.
        -sentence_count:  (int) count of sentences found in list.
        -passive_sentences:  list of passive sentences identified by the
        passive module.
        -passive_sentence_count:  count of the passive_sentences list.
        -percent_passive:  (float) ratio of passive sentences to all sentences
        in percent form.
        -be_verb_analysis:  (int) sum number of occurrences of each to be verb
        (am, is, are, was, were, be, being been).
        -be_verb_count: tbd
        -be_verb_analysis: tbd
        -weak_sentences_all:  (int) sum of be verb analysis.
        -weak_sentences_set:  (set) set of all sentences identified as
        having to be verbs.
        -weak_sentences_count:  (int) count of items in weak_sentences_set.
        -weak_verbs_to_sentences:  (float) proportion of sentences with to
        be to all sentences in percent (this might not be sound).
        -word_tokens:  list of discrete words in text that breaks
        contractions up (default nltk tokenizer).
        -word_tokens_no_punct:  list of all words in text including
        contractions but otherwise no punctuation.
        -no_punct:  (str) full text string without sentence punctuation.
        -word_tokens_no_punct:  uses white-space tokenizer to create a list
        of all words.
        -readability_flesch_re:  (int) Flesch Reading Ease Score (numeric
        score) made by textstat module.
        -readability_smog_index:  (int) grade level as determined by the
        SMOG algorithm made by textstat module.
        -readability_flesch_kincaid_grade:  (int)  Flesch-Kincaid grade level
        of reader made by textstat module.
        -readability_coleman_liau_index:  (int) grade level of reader as made
        by textstat module.
        -readability_ari:  (int) grade level of reader determined by the
        automated readability index algorithm implemented by textstat.
        -readability_linser_write:  (int) grade level as determined by the
        Linsear Write algorithm implemented by textstat.
        -readability_dale_chall:  (int) grade level based on Dale-Chall
        readability as determined by textstat.
        -readability_standard:  composite grade level based on readability
        algorithms.
        -flesch_re_key:  list for interpreting Flesch RE Score.
        -word_count:  word count of document based on the white-space tokenizer;
        this word count should be used.
        -page_length:  (float) page length in decimal format given 250
        words per page.
        -paper_count:  (int) number of printed pages given 250 words per
        page.
        -parts_of_speech:  words with parts of speech tags.
        -pos_counts:  values in word, tag couple grouped in a list (Counter).
        -pos_total:  (int) sum of pos_counts values
        -pos_freq:  (dict) word, ratio of whole
        -doc_pages:  (float) page length based on 250 words per page
        (warning, this is the second time this attribute is defined).
        -freq_words:  word frequency count not standardized based on the
        correct word tokenizer (not ratio, just count).
        modal_dist:  count of auxiliary verbs based on word_tokens_no_punct.
        sentence_count (int): Count the sentence tokens
        passive_sentences (list): List of all sentences identified as passive
        passive_sentence_count (int): count of items in passive_sentences
        be_verb_count (int): count "to be" verbs in text
        word_tokens_no_punct (list): words separated, stripped of punctuation,
         made lower case
        flesch_re_key (str): reading ease score to description
        freq_words (list or dict): frequency distribution of all words
        modal_dist (list): frequency distribution of aux verbs
        """
        self.raw_text = long_string
        self.raw_text = unidecode.unidecode_expect_nonascii(self.raw_text)
        self.user = ""
        self.time_stamp = self.timestamp()
        self.ptext = re.sub('[\u201c\u201d]', '"', self.raw_text)
        self.ptext = re.sub("\u2014", "--", self.ptext)
        self.ptext = re.sub(",", ",", self.ptext)
        self.ptext = re.sub("—", "--", self.ptext)
        self.ptext = re.sub("…", "...", self.ptext)
        self.text_no_feed = self.clean_new_lines(self.ptext)
        self.sentence_tokens = self.sentence_tokenize(self.text_no_feed)
        self.sentence_count = len(self.sentence_tokens)
        self.passive_sentences = passive(self.text_no_feed)
        self.passive_sentence_count = len(self.passive_sentences)
        self.percent_passive = (
            100 *
            (float(self.passive_sentence_count) / float(self.sentence_count)))
        self.percent_passive_round = round(self.percent_passive, 2)
        self.be_verb_analysis = self.count_be_verbs(self.sentence_tokens)
        self.be_verb_count = self.be_verb_analysis[0]
        self.weak_sentences_all = self.be_verb_analysis[1]
        self.weak_sentences_set = set(self.weak_sentences_all)
        self.weak_sentences_count = len(self.weak_sentences_set)
        self.weak_verbs_to_sentences = 100 * float(
            self.weak_sentences_count) / float(self.sentence_count)
        self.weak_verbs_to_sentences_round = round(
            self.weak_verbs_to_sentences, 2)
        self.word_tokens = self.word_tokenize(self.text_no_feed)
        self.word_tokens_no_punct = \
        self.word_tokenize_no_punct(self.text_no_feed)
        self.no_punct = self.strip_punctuation(self.text_no_feed)
        # use this! It make lower and strips symbols
        self.word_tokens_no_punct = self.ws_tokenize(self.no_punct)

        self.readability_flesch_re = \
            textstat.flesch_reading_ease(self.text_no_feed)
        self.readability_smog_index = \
            textstat.smog_index(self.text_no_feed)
        self.readability_flesch_kincaid_grade = \
            textstat.flesch_kincaid_grade(self.text_no_feed)
        self.readability_coleman_liau_index = \
            textstat.coleman_liau_index(self.text_no_feed)
        self.readability_ari = \
            textstat.automated_readability_index(self.text_no_feed)
        self.readability_linser_write = \
            textstat.linsear_write_formula(self.text_no_feed)
        self.readability_dale_chall = \
            textstat.dale_chall_readability_score(self.text_no_feed)
        self.readability_standard = \
            textstat.text_standard(self.text_no_feed)
        self.flesch_re_desc_str = self.flesch_re_desc(
            int(textstat.flesch_reading_ease(self.text_no_feed)))
        self.polysyllabcount = textstat.polysyllabcount(self.text_no_feed)
        self.lexicon_count = textstat.lexicon_count(self.text_no_feed)
        self.avg_syllables_per_word = textstat.avg_syllables_per_word(
            self.text_no_feed)
        self.avg_sentence_per_word = textstat.avg_sentence_per_word(
            self.text_no_feed)
        self.avg_sentence_length = textstat.avg_sentence_length(
            self.text_no_feed)
        self.avg_letter_per_word = textstat.avg_letter_per_word(
            self.text_no_feed)
        self.difficult_words = textstat.difficult_words(self.text_no_feed)
        self.rand_passive = self.select_random(self.passive_sentence_count,
                                               self.passive_sentences)
        if self.weak_sentences_all:  # this attribute (not self.weak_sentences) is what is defined above
            self.rand_weak_sentence = self.select_random(
                len(self.weak_sentences_all), self.weak_sentences_all)
        if self.word_tokens_no_punct:
            self.word_count = len(self.word_tokens_no_punct)
            self.page_length = float(self.word_count) / float(250)
            self.paper_count = int(math.ceil(self.page_length))
            self.parts_of_speech = pos_tag(self.word_tokens_no_punct)
            self.pos_counts = Counter(tag
                                      for word, tag in self.parts_of_speech)
            self.pos_total = sum(self.pos_counts.values())
            self.pos_freq = dict(
                (word, float(count) / self.pos_total)
                for word, count in list(self.pos_counts.items()))
            self.doc_pages = float(float(self.word_count) / float(250))
            self.freq_words = \
                self.word_frequency(self.word_tokens_no_punct)
            self.modal_dist = self.modal_count(self.word_tokens_no_punct)
            # self.ws_tokens = self.ws_tokenize(self.text_no_cr)
            self.pos_count_dict = list(self.pos_counts.items())

            # Model - use for any pos
            self.modals = self.pos_isolate('MD', self.pos_count_dict)
            self.preposition_count = self.pos_isolate('IN',
                                                      self.pos_count_dict)
            self.adjective_count = self.pos_isolate_fuzzy(
                'JJ', self.pos_count_dict)
            self.adverb_count = self.pos_isolate_fuzzy('RB',
                                                       self.pos_count_dict)
            self.proper_nouns = self.pos_isolate_fuzzy('NNP',
                                                       self.pos_count_dict)
            self.cc_count = self.pos_isolate('CC', self.pos_count_dict)
            self.commas = self.char_count(",")
            self.comma_sentences = self.list_sentences(",")
            self.comma_example = self.select_random(len(self.comma_sentences),
                                                    self.comma_sentences)
            self.semicolons = self.char_count(";")
            if self.semicolons:
                self.semicolon_sentences = self.list_sentences(";")
                self.semicolon_example = self.select_random(
                    len(self.semicolon_sentences), self.semicolon_sentences)
            self.lint_suggestions = lint(self.raw_text)
Example No. 24
def merge_review_jsons(folder_name):
    path = '../../data/raw/' + folder_name

    # loop over files in the folder, merge the dataframes
    for fname in os.listdir(path):
        full_path = path + '/' + fname
        tmp = pd.read_json(full_path)
        if 'df' not in vars():
            df = tmp
        else:
            df = pd.concat([df, tmp])

    # each entry is a dictionary, only length 3 was populated with data from
    # the scraper
    num_els = df['results'].apply(lambda x: len(x))

    df = df[num_els == 3]

    # create empty lists, to be appended to by expanding each dictionary
    names_list = []
    ids_list = []
    reviews_list = []

    # loop over each row, open the dictionary to get all reviews associated
    # with each wine...then create lists of the reviews, corresponding names,
    # and ids. then create df from the lists
    for i in range(df.shape[0]):
        entry = df['results'].iloc[i]

        # decode unicode chars
        name = unidecode_expect_nonascii(entry['wine_name'])

        # split wine id from url
        wine_id = int(entry['wine_name_url'].split('?iWine=')[1])

        # get dictionary of wine reviews for each wine
        reviews = entry['wine_reviews']

        n_reviews = len(reviews)

        # append the names and ids to each list n_reviews times
        # when we create dataframe these will then be in proper rows associated
        # with each review
        names_list += [name] * n_reviews
        ids_list += [wine_id] * n_reviews

        # loop over the reviews, decode the unicode chars, then append to list
        for j in range(n_reviews):
            reviews_list.append(unidecode_expect_nonascii(reviews[j]['name']))

    # create dataframe, rename columns
    unpacked = pd.DataFrame(zip(names_list, ids_list, reviews_list))
    unpacked.columns = ['wine_name', 'id', 'review_text']

    # remove rows with no review text
    not_null = ~unpacked['review_text'].isnull()

    unpacked = unpacked[not_null]

    # drop duplicates
    unpacked.drop_duplicates(inplace=True)

    return unpacked
Example No. 25
def DownloadPageHistory(browser, historyRoot, pageName, justUpdate):

    # Open the Fancy 3 page in the browser
    browser.get("http://fancyclopedia.org/" + pageName + "/noredirect/t")

    # Get the first two letters in the page's name
    # These are used to disperse the page directories among many directories so as to avoid having so many subdirectories that Windows Explorer breaks when viewing them
    d1 = pageName[0]
    d2 = d1
    if len(pageName) > 1:
        d2 = pageName[1]

    # Check to see what we have already downloaded.
    # Any history already downloaded will be in historyRoot/d1/d2/pageName+nnnn, where nnnn is the version number
    # Read historyRoot/d1/d2 and make a list of the version number of all directories found
    # The version directories are named Vnnnn
    pagePath = os.path.join(historyRoot, d1, d2, pageName)
    existingVersions = []
    lowestVersionNeeded = 0
    if os.path.exists(pagePath):
        existingVersions = [
            entry for entry in os.scandir(pagePath) if entry.is_dir()
        ]  # All the subdirectory objects
        existingVersions = [entry.name for entry in existingVersions
                            ]  # Make into a list of subdirectory names
        existingVersions = [
            entry[1:] for entry in existingVersions
            if entry[0] == 'V' and len(entry) == 5 and entry[1].isdigit() and
            entry[2].isdigit() and entry[3].isdigit() and entry[4].isdigit()
        ]
        existingVersions = [int(entry)
                            for entry in existingVersions]  # Convert to number
        # Now figure out what the lowest version still needed is. (Knowing this may allow us to optimize page loads.)
        if len(existingVersions) > 0:
            lowestVersionNeeded = max(existingVersions) + 1
            i = 0
            while i < max(existingVersions) + 1:
                if i not in existingVersions:
                    lowestVersionNeeded = i  # Note that this will be max+1 if there are no gaps in the list
                    break
                i = i + 1

    print("   First version needed: " + str(lowestVersionNeeded))
    # Page found?
    errortext = "The page <em>" + pageName.replace(
        "_", "-") + "</em> you want to access does not exist."
    if errortext in browser.page_source:
        print("*** Page does not exist: " + pageName)
        return

    # Find the history button and press it
    browser.find_element_by_id('history-button').send_keys(Keys.RETURN)
    time.sleep(0.5)  # Just-in-case

    # Wait until the history list has loaded
    try:
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.ID, 'revision-list')))
    except:
        print(
            "***Oops. Exception while waiting for the history list to load in "
            + pageName + ":  Retrying")
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.ID, 'revision-list')))

    # Step over the pages of history lines (if any)
    # The pages are a series of spans creating a series of boxes, each with a number in it.  There is a box labeled "current" and we want to click on the *next* box.
    firstTime = True
    terminate = False
    while not terminate:

        # There may be a "pager" -- a series of buttons to show successive pages of history
        try:
            pagerDiv = browser.find_element_by_xpath(
                '//*[@id="revision-list"]/div')
        except SeEx.NoSuchElementException:
            pagerDiv = None
        except:
            print("***Oops. Exception while looking for pager div in " +
                  pageName)
            return

        if pagerDiv == None and not firstTime:
            break

        # If there are multiple pages of history, then before starting the second and subsequent loops, we need to go to the next page
        if pagerDiv != None and not firstTime:
            # Find the current page indicator
            els = pagerDiv.find_elements_by_tag_name("span")
            # And click the *next*, if any, to go to the next page
            for i in range(0, len(els)):
                if els[i].get_attribute("class") == "current":
                    if i + 1 < len(els):
                        els[i + 1].find_element_by_tag_name("a").send_keys(
                            Keys.RETURN)
                    else:
                        terminate = True
                    break
            if terminate:
                break

        firstTime = False

        # Get the history list
        historyElements, id = ExtractHistoryList(browser)

        # Note that the history list is from newest to oldest, but we don't care since we traverse them all
        # The structure of a line is
        #       The revision number followed by a "."
        #       A series of single letters (these letters label buttons)
        #       The name of the person who updated it
        #       The date
        #       An optional comment
        # This calls for a Regex
        rec = Regex.compile(
            "^"  # Start at the beginning
            "(\d+)."  # Look for a number at least one digit long followed by a period and space
            "( [A-UW-Z]| [A-Z] [A-UW-Z]|)?"  # Look for a single capital letter or two separated by spaces or this could be missing
            # We skip the V as the final letter to avoid conflict with the next pattern
            "( V S R | V S )"  # Look for either ' V S ' or ' V S R '
            "(.*)"  # Look for a name
            "(\d+ [A-Za-z]{3,3} 2\d{3,3})"  # Look for a date in the 2000s of the form 'dd mmm yyyy'
            "(.*)$")  # Look for an optional comment

        #TODO: Don't load earlier history pages unless we need to.
        i = 0
        while i < len(
                historyElements
        ):  # We do this kludge because we need to continually refresh historyElements. While it may become stale, at least it doesn't change size
            # Regenerate the history list, as it may have become stale
            # This while loop, et al, is to allow retries since sometimes it doesn't seem to load in time
            historyElements = None
            count = 0
            gps = None
            while (historyElements == None or gps is None) and count < 5:
                try:
                    historyElements = browser.find_element_by_xpath(
                        '//*[@id="revision-list"]/table/tbody'
                    ).find_elements_by_xpath("tr")
                    time.sleep(0.1)
                    id = historyElements[i + 1].get_attribute("id").replace(
                        "revision-row-", ""
                    )  # This code is here just to trigger an exception if not loaded fully
                    t = historyElements[
                        i +
                        1].text  # This code is here just to trigger an exception if not loaded fully
                    historyElements = historyElements[
                        1:]  # The first row is column headers, so skip them.

                    el = historyElements[i]
                    id = el.get_attribute("id").replace("revision-row-", "")
                    t = el.text
                    # print("t='"+t+"'")
                    m = rec.match(t)
                    gps = m.groups()
                    if gps == None:
                        print("***gps is None")
                    if len(gps) < 5:
                        print("***gps is too short")
                    user = gps[3]
                except Exception as exception:
                    # Wait and try again
                    time.sleep(1)
                    count = count + 1
                    print("... Retrying historyElements(2): " +
                          type(exception).__name__ + "  count=" + str(count))
            if historyElements == None and count >= 5:
                print("***Could not get historyElements(2) after five tries.")
            if gps == None:
                print("***gps is None (2)")

            # Get the revision number.  Skip it if it's in the list of existing revisions
            revNum = gps[0]
            if int(revNum) not in existingVersions:

                # The Regex greedy capture of the user name captures the 1st digit of 2-digit dates.  This shows up as the user name ending in a space followed by a single digit.
                # Fix this if necessary
                user = gps[3]
                date = gps[4]
                if user[-2:-1] == " " and user[-1:].isdigit():
                    date = user[-1:] + gps[4]
                    user = user[:-2]

                # Click on the view source button for this row
                el.find_elements_by_tag_name(
                    "td")[3].find_elements_by_tag_name("a")[1].click()
                # This while loop, et al, is to allow retries since sometimes it doesn't seem to load in time
                divRevList = None
                count = 0
                while divRevList == None and count < 5:
                    try:
                        divRevList = browser.find_element_by_xpath(
                            '//*[@id="revision-list"]/table/tbody')
                    except SeEx.NoSuchElementException:
                        # Wait and try again
                        time.sleep(1)
                        count = count + 1
                if divRevList == None and count >= 5:
                    print("***Could not get divRevList after five tries.")

                source = None
                count = 0
                while source == None and count < 5:
                    try:
                        source = divRevList.find_element_by_xpath(
                            '//*[@id="history-subarea"]/div').text
                    except (SeEx.NoSuchElementException,
                            SeEx.StaleElementReferenceException):
                        # Wait and try again
                        time.sleep(1)
                        count = count + 1
                if source == None and count >= 5:
                    print("***Could not get source after five tries.")
                del divRevList

                # Write out the xml data
                root = ET.Element("data")
                el = ET.SubElement(root, "number")
                number = str(gps[0])
                el.text = number
                el = ET.SubElement(root, "ID")
                el.text = str(id)
                el = ET.SubElement(root, "type")
                el.text = str(gps[1])
                el = ET.SubElement(root, "name")
                el.text = str(user)
                el = ET.SubElement(root, "date")
                el.text = str(date)
                el = ET.SubElement(root, "comment")
                el.text = str(gps[5])
                # And write the xml out to file <localName>.xml.
                tree = ET.ElementTree(root)

                # OK, we have everything.  Start writing it out.

                # Make sure the target directory exists
                seq = ("0000" + number)[-4:]  # Add leading zeroes
                dir = os.path.join(pagePath, "V" + seq)
                pathlib.Path(dir).mkdir(parents=True, exist_ok=True)

                print("    Loaded V" + seq)

                # Write the directory contents
                tree.write(os.path.join(dir, "metadata.xml"))
                with open(os.path.join(dir, "source.txt"), 'a') as file:
                    file.write(unidecode.unidecode_expect_nonascii(source))

            i = i + 1

        # The history pages are loaded recent (highest version number) first.
        # Check to see if subsequent pages could *possibly* have a version we need.
        # If not, end the loop over pages of history lists.
        if int(revNum) < lowestVersionNeeded:
            break

    # Download the files currently attached to this page
    # Find the files button and press it
    #TODO: Avoid downloading files which already have been downloaded.
    elem = browser.find_element_by_id('files-button')
    elem.send_keys(Keys.RETURN)
    time.sleep(0.7)  # Just-in-case

    # Wait until the history list has loaded
    wait = WebDriverWait(browser, 10)
    #wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'page-files')))
    try:
        els = browser.find_element_by_class_name(
            "page-files").find_elements_by_tag_name("tr")
        for i in range(1, len(els)):
            h = els[i].get_attribute("outerHTML")
            url, linktext = Helpers.GetHrefAndTextFromString(h)
            urllib.request.urlretrieve(
                "http://fancyclopedia.org" + url,
                os.path.join(os.path.join(pagePath, linktext)))
        print("      " + str(len(els) - 1), " files downloaded.")
    except:
        k = 0

    # Update the donelist
    with open(os.path.join(historyRoot, "donelist.txt"), 'a') as file:
        file.write(pageName + "\n")

    return
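Minor aside: the leading-zero padding used above for the version directories, ("0000" + number)[-4:], can also be written with str.zfill:

number = '17'  # hypothetical revision number
assert ('0000' + number)[-4:] == number.zfill(4) == '0017'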
Example No. 26
def scrape_url():
    #num1 = randint(1, 10)
    #num2 = randint(1, 10)
    if request.method == 'POST':
        if request.form['email_add']:
            return render_template('404.html')
        #if (request.form['userSolution'] != request.form['sum']):
        #    return redirect(request.url)
        result = request.form
        if 'plaintext' not in result:
            return redirect(request.url)
        #if 'author' not in result:
        #    author = ""
        url = result['plaintext']
        target = urllib.request.Request(url)
        target.add_header('Accept-Encoding', 'utf-8')
        target.add_header(
            'User-Agent',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'
        )
        response = urllib.request.urlopen(target)
        #soup = BS(response.read().decode('utf-8'), convertEntities=BS.HTML_ENTITIES)
        #soup = BS(response.read().decode('utf-8', 'ignore'), convertEntities=BS.HTML_ENTITIES)
        soup = BeautifulSoup(response, 'html.parser')
        # except urllib2.HTTPError, e:
        #     print('We failed with error code %s' % e.code)
        #     if e.code == 404:
        #         render_template('404.html')
        #     elif e.code == 403:
        #         render_template('403.html')
        #     else:
        #         pass
        #paragraphs = ""
        #for s in soup.findAll('br'):
        #    paragraphs += s.get_text(separator=" ", strip=True)
        paragraphs = soup.findAll('p')
        #title = soup.find('title')
        title = soup.title.string
        #h = HTMLParser()
        #paragraphs = h.unescape(paragraphs)
        #title = title.getText()
        #title = h.unescape(title)
        plaintext = ""
        for p in paragraphs:
            plaintext += p.text + "\n\n"
            #plaintext += p.getText() + '\n\n'
            #plaintext += p.getText(" ") + '\n\n'

        prose = unidecode_expect_nonascii(plaintext)
        #prose = paragraphs
        apis = result['apis']
        author = result['author']
        administrator = result['administrator']
        admin_notes = result['notes']

        global Doc

        Doc = read_document.Sample(prose, author, apis)
        Doc.administrator = administrator
        Doc.admin_notes = admin_notes
        if title:
            Doc.title = title
        else:
            Doc.title = url
        if Doc:
            return redirect(url_for('feedback', timestamp=Doc.title))
        else:
            return render_template('url.html')
    else:
        return render_template('url.html')
Example No. 27
 def __call__(self, text):
     try:
         from unidecode import unidecode_expect_nonascii
     except ImportError:
         raise ImportError('Please install package `unidecode`')
     return unidecode_expect_nonascii(text)
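Assuming the enclosing class is called, say, Transliterator (its name is not shown in the snippet), usage would look roughly like this:

# transliterator = Transliterator()
# transliterator('Fjörd crème')   ->  'Fjord creme'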
Example No. 28
 def run(self):
     global allfiles
     global thread_count
     global pathq
     empty = False
     filesize = 0
     path, size, filetype, modify_time, create_time, access_time, scan_time, uid, gid = [], [], [], [], [], [], [], [], []
     while True:  
         filelist = []
         try:
             scanpath = pathq.get()
             logging.info('start at: '+start+ "\n")
             logging.info(scanpath+ "\n")
             if os.path.isdir(scanpath):
                 filelist = os.listdir(scanpath)
             else:
                 print(scanpath + ' is not a valid path')
                 continue
             if len(filelist) > 100000:
                 print(scanpath + ' has: ' + str(len(filelist)) + ' files, do not scan now')
                 continue
             for file in filelist:
                 filename = os.path.join(scanpath,file)
                 if os.path.exists(filename):
                     if os.path.isdir(filename):
                         pathq.put(filename)
                     elif os.path.isfile(filename):
                         lock.acquire()
                         allfiles += 1
                         print(allfiles)
                         lock.release()
                         fileinfo = os.stat(filename)
                         size.append(fileinfo.st_size)
                         filetype.append(fileinfo.st_type)
                         modify_time.append(self.timetos(fileinfo.st_mtime))
                         create_time.append(self.timetos(fileinfo.st_ctime))
                         access_time.append(self.timetoint(fileinfo.st_atime))
                         scan_time.append(datetime.datetime.now())
                         uid.append(fileinfo.st_uid)
                         gid.append(fileinfo.st_gid)
                         filename = unidecode.unidecode_expect_nonascii(filename)
                         path.append(filename)
                         filesize += 1
                         if filesize == 50000:
                             print("Write InfluxDB...")
                             df = pd.DataFrame({"path" : path,"size": size,"filetype": filetype,"modify_time": modify_time,"create_time": create_time,"access_time" : access_time,
                                                "scan_time": scan_time,"uid": uid,"gid": gid}, index=pd.date_range(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), freq='U', periods=len(path)))
                             client.write_points(df, 'infos', protocol='json')
                             print("Write Done")
                             logging.info('start at: '+start)
                             logging.info('end at: '+datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
                             logging.info("filesize: %s" % allfiles)
                             path, size, filetype, modify_time, create_time, access_time, scan_time, uid, gid = [], [], [], [], [], [], [], [], []
                             filesize = 0                
         except Queue.Empty as e:
             empty = True
             print("Write InfluxDB...")
             df = pd.DataFrame({"path" : path,"size": size,"filetype": filetype,"modify_time": modify_time,"create_time": create_time,"access_time" : access_time,
                                                "scan_time": scan_time,"uid": uid,"gid": gid}, index=pd.date_range(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), freq='U', periods=len(path)))
             client.write_points(df, 'infos', protocol='json')
             print("Write Done")
             logging.info('start at: '+start)
             logging.info('end at: '+datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
             logging.info("filesize: %s" % allfiles)
             path, size, filetype, modify_time, create_time, access_time, scan_time, uid, gid = [], [], [], [], [], [], [], [], []                 
             print('scanning end, now you can enter ctrl+c to stop this program')
             break
         except OSError as e:
             print('OSError: ' + str(e))
             continue
         except Exception as e:
             print('Exception: ' + str(e))
             continue
         finally:
             if not empty:
                 pathq.task_done()
Example No. 29
#val = int(val)
val = 1

host = 'localhost'
port = 27017

print('We are in dbConnection.py')
try:
    connection = MongoClient('localhost', 27017)
    db = connection.core
    collection = db.rss_feed_entry
    dbtext = ''  # accumulator for the concatenated titles (initialised here; the snippet never defines it)
    for x in collection.find({}, {"_id": 0, "title": 1, "newsCategory": 1}):
        if (x['newsCategory'] == "Emerging Threats and Cyberattacks"
                or x['newsCategory'] == "Cyber Hacks and Incident"
                or x['newsCategory'] == "Threat Actors and Tools"):
            print(x['newsCategory'])
            dbtext = dbtext + unidecode.unidecode_expect_nonascii(x['title'])
    dbtext1 = sent_tokenize(dbtext)
   # dbtext1 = list(dbtext.split(" "))
   # print('dbtext->', dbtext1)
    print("Connected successfully!!!")
except Exception as e:
    print(e)

# Preprocessing
def remove_string_special_characters(s):
    # Replace special characters with '' ('A-Z' rather than the original 'A-z', which also spanned some punctuation)
    stripped = re.sub(r'[^a-zA-Z\s]', '', s)
    stripped = re.sub(r'_', '', stripped)
    # Change any run of whitespace to one space
    stripped = re.sub(r'\s+', ' ', stripped)