def init_tcex(requires_tc_token=False):
    """Initialize the tcex instance."""
    tcex_instance = tcex.TcEx()
    tcex_instance.log.debug('Creating content in {}. If this is not correct, pass in a different owner name using the --api_default_org flag.'.format(tcex_instance.args.api_default_org))
    tcex_instance.args.api_access_id = os.environ['API_ACCESS_ID']
    tcex_instance.args.tc_temp_path = 'log'
    # this manually sets the logging level
    tcex_instance.log.setLevel(logging.DEBUG)
    tcex_instance.args.tc_log_path = 'log'
    tcex_instance.args.tc_out_path = 'log'
    tcex_instance.args.tc_api_path = os.environ['TC_API_PATH']
    tcex_instance.args.api_default_org = os.environ['API_DEFAULT_ORG']
    tcex_instance.args.api_secret_key = os.environ['API_SECRET_KEY']

    if requires_tc_token:
        if os.environ.get('TC_TOKEN'):
            tcex_instance.args.tc_token = os.environ['TC_TOKEN']
            # parse the expiration timestamp from the tc_token
            tcex_instance.args.tc_token_expires = tcex_instance.args.tc_token.split(':')[4]
        # if the request requires a token and a token is not found, raise an error
        else:
            raise RuntimeError('The TC_TOKEN environmental variable is required and was not found. Please add it (you can find instructions for doing so here: https://gitlab.com/fhightower-tc/tcex-playground#setup).')

    # clear out any data in the source
    cleaner.clean(tcex_instance)
    validator.validate(tcex_instance, expected_groups=0, expected_indicators=0)
    return tcex_instance

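# A minimal usage sketch for the helper above, assuming the environment
# variables it reads (API_ACCESS_ID, API_SECRET_KEY, API_DEFAULT_ORG,
# TC_API_PATH, and optionally TC_TOKEN) are already exported; the log call is
# illustrative only and not part of the original module.
if __name__ == '__main__':
    tcex_instance = init_tcex(requires_tc_token=False)
    tcex_instance.log.info('Initialized against {}'.format(tcex_instance.args.tc_api_path))
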
def post(self):
    if ifConnected == 1:
        bl.quit_mesh()
    conn = db_connect.connect()
    query = conn.execute("delete from devices;")
    cleaner.clean()
    return {'status': 'OK'}

def main():
    creds = config.get_creds()
    sftp.download(creds.get("sftp_url"), creds.get("sftp_username"),
                  creds.get("sftp_password"), creds.get("localpath"))
    cleaner.clean(creds.get("localpath"))
    merge.merge(creds.get("localpath"))
    scraper.scrape(creds)

def get(self, chinese):
    input = [chinese, '']
    input = clean(input)
    prepare = vectorizer.transform(clean(input))
    pred = clf.predict(prepare)
    level = np.asscalar(np.int16(pred[0]))
    result = {chinese: level}
    return result

def logistic_regression():
    if request.method == 'POST':
        # if user presses submit after uploading dataset and target
        if 'file' in request.files and 'target' in request.form:
            file = request.files['file']
            session['filename'] = file.filename
            data = pd.read_csv(file)
            data = cleaner.clean(data)
            session['target'] = request.form['target']
            session['target'] = cleaner.fix_target(session['target'])
        # if user only needs to upload target and presses submit
        elif 'get_target' in request.form:
            session['target'] = request.form['get_target']
            session['target'] = cleaner.fix_target(session['target'])
            data = pd.read_pickle(session['filename'])
        # perform logistic regression
        logreg_model = logreg.logreg(data, session['target'])
        return render_template('linearreg.html')
    # if user needs to upload csv and target
    if request.method == 'GET':
        if not os.path.isfile(session['filename']):
            return render_template('linearreg.html')
        else:
            # if user only needs to upload target
            return render_template('linearreg.html', get_target=True)

def main():
    config = parse_config(sys.argv)
    results = do_search(config)
    if len(results) == 0:
        print('No results found.')
        sys.exit(1)
    elif len(results) == 1:
        result = results[0]
    else:
        result = do_disambiguate(results)

    filename = do_download(result)
    print('Saved pronunciation to ./{}'.format(filename))

    if config['clean']:
        print('Cleaning..')
        username = result['standard_pronunciation']['username']
        profile = cleaner.find_noise_profile(username)
        if profile is None:
            print('No noise profile exists for {}. '
                  'We will try to create one.'.format(username))
        cleaned_filename, new_profile = cleaner.clean(filename, username, noise_profile=profile)
        if profile is None:
            if new_profile is None:
                print('Noise profile creation aborted.')
            else:
                print('Saved new profile to {}'.format(new_profile))
        print('Cleaned pronunciation saved to ./{}'.format(cleaned_filename))

def make(sourcefile, modulename):
    import cleaner, preprocess
    if not os.access(sourcefile, os.F_OK):
        raise IOError(sourcefile)  # sourcefile
    basename = os.path.basename(sourcefile)
    preprocessed = "%s.c" % (modulename)
    cleaned = "%s_clean.c" % (modulename)
    #xml = "%s.xml"%(modulename)
    pyfinal = "%s.py" % (modulename)
    if not os.access(pyfinal, os.F_OK):
        if not os.access(cleaned, os.F_OK):
            if not os.access(preprocessed, os.F_OK):
                # preprocess the file
                if preprocess.process(sourcefile, preprocessed) > 0:
                    return
                log.info('PREPROCESS - OK')
            # clean it
            if cleaner.clean(preprocessed, cleaned) > 0:
                return
            log.info('CLEAN - OK')
        # generate pyfinal
        if gen(cleaned, modulename) > 0:
            return
        log.info('PYFINAL - OK')
    __import__(modulename)
    import inspect
    nbClass = len(inspect.getmembers(sys.modules[modulename], inspect.isclass))
    nbMembers = len(inspect.getmembers(sys.modules[modulename]))
    log.info("module %s has %d members for %d class" % (modulename, nbMembers, nbClass))

def test_clean__raise_exception_when_no_closing_tag():
    def mock_walk(path: str):
        return [
            ('root_dir', [], ['mango.py']),
        ]

    line_processor = lambda line: None
    path = '.'

    with patch('builtins.open', mock_open(read_data=FILE_WITHOUT_CLOSING_TAG)), \
            patch('os.walk', mock_walk), \
            patch('config.FILE_EXTENSIONS_TO_PROCESS', {'.py', '.js'}), \
            patch('cleaner.save_file', MagicMock()) as save_file_mock, \
            patch('config.OPENING', '# ▼▼▼ MY TEMP CODE. DELETE ME ▼▼▼'), \
            patch('config.ENDING', '# ▲▲▲ MY TEMP CODE. DELETE ME ▲▲▲'), \
            pytest.raises(cleaner.ClosingTagNotFoundException):
        cleaner.clean(path, line_processor)

    save_file_mock.assert_not_called()

def main(input_filepath, output_filepath):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """
    logger = logging.getLogger(__name__)
    logger.info('making final data set from raw data')

    data = clean(input_filepath)
    data.to_csv(output_filepath, index=False)

def generate(source='articles/cnbeta'):
    'combines cleaner and segmenter'
    import cleaner, segmenter
    documents = []
    items = cleaner.clean(source)
    documents = segmenter.segment(items)
    publish(documents, source)
    return documents

def load_docs(filepath, clean_text=True):
    ret = []
    for f_name in os.listdir(filepath):
        if clean_text:
            ret.append(
                clean(open(filepath + "/" + f_name, 'rb').read().decode('UTF-8')))
            continue
        ret.append(open(filepath + "/" + f_name, 'rb').read().decode('UTF-8'))
    return ret

def recommendations():
    s = session()
    rows = s.query(News).filter(News.label == None).all()
    classified_news = []
    for i in rows:
        prediction = model.predict(clean(i.title))
        for j in range(len(prediction)):
            if prediction[j] == 'good':
                classified_news.append(i)
            else:
                break
    return template('news_recommendations', rows=classified_news)

def test_clean(line_processor_name, file_data, expected_result):
    def mock_walk(path: str):
        return [
            ('root_dir', [], ['mango.py']),
        ]

    line_processor = cleaner.LINE_PROCESSORS[line_processor_name]
    path = '.'

    with patch('builtins.open', mock_open(read_data=file_data)), \
            patch('os.walk', mock_walk), \
            patch('cleaner.save_file', MagicMock()) as save_file_mock, \
            patch('cleaner.FILE_EXTENSIONS_TO_PROCESS', {'.py', '.js'}), \
            patch('cleaner.OPENING', '# ▼▼▼ MY TEMP CODE. DELETE ME ▼▼▼'), \
            patch('cleaner.ENDING', '# ▲▲▲ MY TEMP CODE. DELETE ME ▲▲▲'):
        cleaner.clean(path, line_processor)

    expected = [l + '\n' for l in expected_result.split('\n')][:-1]  # split without deleting the delimiter
    assert (save_file_mock.call_args_list == [
        call('root_dir/mango.py', expected)
    ])

def test_clean__include_only_defined_extensions_and_exclude_dirs():
    def mock_walk(path: str):
        return [
            ('root_dir', [], ['mango.py', 'bannana.js', 'raspberry.html', 'lenmon.txt']),
            ('test/excluded_dir', [], ['blackberry.py', 'cherry.js']),
        ]

    line_processor = lambda line: None
    path = '.'

    with patch('builtins.open', mock_open(read_data='Some data about fruits, irrelevant')) as mocked_open, \
            patch('os.walk', mock_walk), \
            patch('cleaner.FILE_EXTENSIONS_TO_PROCESS', {'.py', '.js'}), \
            patch('cleaner.EXCLUDE_DIRS', {'excluded_dir'}):
        cleaner.clean(path, line_processor)

    assert mocked_open.call_args_list == [
        call('root_dir/mango.py', 'r'),
        call('root_dir/mango.py', 'w'),
        call('root_dir/bannana.js', 'r'),
        call('root_dir/bannana.js', 'w'),
    ]

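# The cleaner.clean(path, line_processor) tests above (closing-tag, parametrized,
# and extension/exclude-dir) all patch the same module-level names. A minimal
# sketch of the module they imply is given below: the names
# (FILE_EXTENSIONS_TO_PROCESS, EXCLUDE_DIRS, OPENING, ENDING, save_file,
# ClosingTagNotFoundException) come from the patches, but the exact behaviour
# between the markers is an assumption, not the real implementation.
import os

FILE_EXTENSIONS_TO_PROCESS = {'.py', '.js'}
EXCLUDE_DIRS = {'excluded_dir'}
OPENING = '# ▼▼▼ MY TEMP CODE. DELETE ME ▼▼▼'
ENDING = '# ▲▲▲ MY TEMP CODE. DELETE ME ▲▲▲'


class ClosingTagNotFoundException(Exception):
    """Raised when an OPENING marker has no matching ENDING marker."""


def save_file(filepath, lines):
    # write the processed lines back in place
    with open(filepath, 'w') as f:
        f.writelines(lines)


def clean(path, line_processor):
    for root, _dirs, files in os.walk(path):
        # skip anything under an excluded directory
        if any(part in EXCLUDE_DIRS for part in root.split(os.sep)):
            continue
        for name in files:
            if os.path.splitext(name)[1] not in FILE_EXTENSIONS_TO_PROCESS:
                continue
            filepath = os.path.join(root, name)
            with open(filepath, 'r') as f:
                lines = f.readlines()
            result, inside = [], False
            for line in lines:
                if OPENING in line:
                    inside = True
                    continue
                if ENDING in line:
                    inside = False
                    continue
                if inside:
                    # the line processor decides what (if anything) survives
                    processed = line_processor(line)
                    if processed is not None:
                        result.append(processed)
                else:
                    result.append(line)
            if inside:
                raise ClosingTagNotFoundException(filepath)
            save_file(filepath, result)
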
def read_emails(path):
    files = [f for f in listdir(path) if isfile(join(path, f))]
    try:
        del files[files.index('.DS_Store')]
    except ValueError:
        pass
    reader = WordListCorpusReader(path, files)
    text = clean(reader.raw())
    emails = split_emails(text, reader.fileids())
    return emails

def test_indicator_associations():
    """."""
    tcex = utility.init_tcex()
    tcex.jobs.indicator({
        "summary": "4.5.6.7",
        "type": "Address",
    })
    tcex.jobs.indicator({
        "summary": "ASN1234",
        "type": tcex.safe_rt('ASN', lower=False),
    })
    tcex.jobs.association({
        'association_value': 'ASN1234',
        'association_type': tcex.safe_rt('ASN', lower=False),
        'resource_value': '4.5.6.7',
        'resource_type': 'Address'
    })
    tcex.jobs.process(tcex.args.api_default_org)
    assert len(tcex.jobs.indicator_results['failed']) == 0
    assert len(tcex.jobs.indicator_results['not_saved']) == 0
    assert len(tcex.jobs.indicator_results['saved']) == 2
    verify_association_created(tcex)
    cleaner.clean()

def main():
    import json, sys, cleaner, glob
    with open('brown_output', 'w') as w_fh:
        lyrics_files = glob.glob('*.lyrics')
        for each_lyr in lyrics_files:
            for each_l in open(each_lyr):
                (title, js) = each_l.strip().split('\t')
                list_lyrics = json.loads(js)
                if list_lyrics is not None and len(list_lyrics) > 0:
                    lyr = cleaner.clean(list_lyrics[0])
                    for each_lyrline in lyr:
                        to_append = ' '.join(each_lyrline.strip()).encode('utf-8')
                        w_fh.write(to_append + '\n')
    return

def linearRegression():
    if request.method == 'POST':
        # if user presses submit after uploading dataset and target
        if 'file' in request.files and 'target' in request.form:
            file = request.files['file']
            session['filename'] = file.filename
            data = pd.read_csv(file)
            data = cleaner.clean(data)
            session['target'] = request.form['target']
            session['target'] = cleaner.fix_target(session['target'])
        # if user only needs to upload target and presses submit
        elif 'get_target' in request.form:
            session['target'] = request.form['get_target']
            session['target'] = cleaner.fix_target(session['target'])
            data = pd.read_pickle(session['filename'])
        # perform linear regression
        linreg_model = linreg.linreg(data, session['target'])
        ans = ''
        eq = g.selected_features + ' * ' + str(linreg_model.coef_[0])
        if np.sign(linreg_model.coef_) > 0:
            ans = '+ ' + eq
        else:
            ans = '- ' + eq
        return render_template('linearreg.html', intercept=linreg_model.intercept_,
                               coef_name=g.selected_features, coef_num=linreg_model.coef_,
                               r_squared=g.r_squared, mae=g.mae, eq=ans)
    # if user needs to upload csv and target
    if request.method == 'GET':
        if not os.path.isfile(session['filename']):
            return render_template('linearreg.html')
        else:
            # if user only needs to upload target
            return render_template('linearreg.html', get_target=True)

def eda():
    if request.method == 'GET':
        return render_template('eda.html')
    if request.method == 'POST' and 'file' in request.files:
        file = request.files['file']
        session['filename'] = file.filename
        data = pd.read_csv(file)
        data = cleaner.clean(data)
        data.to_pickle(session['filename'])
        return render_template('eda.html', tables=[data.head().to_html()], nulls=g.nulls,
                               duplicates=g.duplicates, outliers=g.outliers, memory=g.memory)

def process(tweet, relevance):
    reload(sys)
    sys.setdefaultencoding('utf8')
    text = tweet.text.encode('utf-8', errors='ignore')
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=False)
    entity_names, entity_locs, entity_per = [], [], []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))
        entity_locs.extend(extract_entity_logs(tree))
        entity_per.extend(extract_entity_pers(tree))
    a = Scholarship()
    a.tweet_id = tweet.id
    a.created_at = tweet.created_at
    a.tweet_lang = tweet.lang
    a.user_name = tweet.user.screen_name
    if tweet.place is not None:
        a.user_country = tweet.place.country
    a.text = clean(tweet.text)
    a.scholarship_name = return_scholarship_name(entity_names, entity_locs, entity_per)
    a.university = return_university_name(text)
    a.deadline = return_deadline(text)
    a.category = return_category(text)
    a.info_url = return_urls(tweet)
    context = find_place(a.university, entity_locs)
    a.country = return_country(context['lng'], context['lat'])
    a.longitude = context['lng']
    a.latitude = context['lat']
    if a.scholarship_name is not None and a.longitude is not None and a.latitude is not None:
        a.markerName = preprocess_str(a.scholarship_name) + ' , ' + context['place']
        a.markerType = context['type']
    else:
        a.markerType = a.markerName = None
    a.relevant = relevance
    try:
        a.save()
    except Exception as e:
        print "there is a problem ", e

def main(caption):
    model = Word2Vec.load('w2v/word2vec.bin')
    text = clean(caption, lemmatize=True, stop_words=True)
    sw = set(stopwords.words('english'))
    hashtags = []
    for i in text:
        try:
            hashtag = model.wv.most_similar(i)
        except KeyError:
            hashtag = False
        if hashtag:
            for tags, score in hashtag:
                if tags not in punctuation and tags not in sw and len(tags) > 2:
                    tag = f"#{tags}"
                    hashtags.append(tag)
    print(hashtags)

def extractDoc(ext):
    root = 'data'
    data = []
    for f in os.listdir(os.path.join(root, ext))[:5]:
        with open(os.path.join(root, ext, f), 'r') as sc:
            sc = clean(sc.read(), 'cpp')
            data.append(sc)
            print "[SUCCESS] Read", os.path.join(root, ext, f)

    vectorizer = TfidfVectorizer(tokenizer=tokenize, ngram_range=(1, 2))
    X = vectorizer.fit_transform(data)
    del data

    features_by_gram = defaultdict(list)
    for f, w in zip(vectorizer.get_feature_names(), vectorizer.idf_):
        features_by_gram[len(f.split(' '))].append((f, w))
    top_n = 50
    for gram, features in features_by_gram.iteritems():
        top_features = sorted(features, key=lambda x: x[1], reverse=True)[:top_n]
        top_features = [f[0] for f in top_features]
        print '{}-gram top:'.format(gram), top_features

def train():
    f = open(sys.argv[1], "r")
    corpus = []
    target = []
    failed = 0
    for line in f:
        try:
            pieces = line.split(DELIMITER)
            pieces = map(lambda x: x[1:len(x) - 1], pieces)
            corpus.append(clean(pieces[5]))
            target.append(int(pieces[0]))
        except:
            failed += 1

    X = VECTORIZER.fit_transform(corpus)
    # save vectorizer.
    with open('vectorizer.pkl', 'wb') as fid:
        cPickle.dump(VECTORIZER, fid)

    # save logistic regression model.
    logreg = linear_model.LogisticRegression(C=1e5)
    logreg.fit(X, np.array(target))
    with open('logreg.pkl', 'wb') as fid:
        cPickle.dump(logreg, fid)

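# A minimal sketch of the matching inference step for the train() function
# above, assuming the same clean() helper and the vectorizer.pkl / logreg.pkl
# files it writes; the predict() wrapper name is hypothetical and not part of
# the original script.
import cPickle


def predict(text):
    # reload the fitted vectorizer and model saved by train()
    with open('vectorizer.pkl', 'rb') as fid:
        vectorizer = cPickle.load(fid)
    with open('logreg.pkl', 'rb') as fid:
        logreg = cPickle.load(fid)
    features = vectorizer.transform([clean(text)])
    return logreg.predict(features)[0]
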
def preprocess():
    vec = CountVectorizer()
    data = clean('lyrics.txt')
    vec.fit_transform([i[1] for i in data])
    shuffle(data)
    data, d2 = tts(data, test_size=0.1)
    data = chunkify(data, 10)
    lyrics = []
    for d in data:
        temp = [[], []]
        for item in d:
            temp[0].append(item[1])
            temp[1].append(item[0])
        temp[0] = vec.transform(temp[0]).toarray().tolist()
        lyrics.append(temp)
    test = [[], []]
    for item in d2:
        test[0].append(item[1])
        test[1].append(item[0])
    test[0] = vec.transform(test[0]).toarray().tolist()
    return [lyrics, test]

def make(sourcefile, modulename, target=False):
    '''
    Using gccxml directly hurts ctypeslib performance, but on some
    libraries we don't have a choice.
    '''
    if not os.access(sourcefile, os.F_OK):
        raise IOError(sourcefile)  # sourcefile
    basename = os.path.basename(sourcefile)
    preprocessed = "%s.c" % (modulename)
    cleaned = "%s_clean.c" % (modulename)
    xml = "%s.xml" % (modulename)
    pyfinal = "%s.py" % (modulename)
    if target:
        gen2(sourcefile, modulename, target)
        log.info('PYFINAL - OK')
    else:
        if not os.access(pyfinal, os.F_OK):
            if not os.access(cleaned, os.F_OK):
                if not os.access(preprocessed, os.F_OK):
                    # preprocess the file
                    if preprocess.process(sourcefile, preprocessed) > 0:
                        return
                    log.info('PREPROCESS - OK')
                # clean it
                if cleaner.clean(preprocessed, cleaned) > 0:
                    return
                log.info('CLEAN - OK')
            # generate pyfinal
            if gen(cleaned, modulename) > 0:
                return
            log.info('PYFINAL - OK')
    __import__(modulename)
    import inspect
    nbClass = len(inspect.getmembers(sys.modules[modulename], inspect.isclass))
    nbMembers = len(inspect.getmembers(sys.modules[modulename]))
    log.info("module %s has %d members for %d class" % (modulename, nbMembers, nbClass))

# param 2 : output file path
####################
# Examples of use :
#
# To download 1000 questions with potential answers with java tag :
# python stackoverflow.py sdd java 3 javarawdump.json
#
# To clean data :
# python stackoverflow.py c javarawdump.json javacleaneddump.json

import sys
from cleaner import clean
from downloader import download

try:
    mode = sys.argv[1]
    if mode == 'sdd':  # simple data downloader
        tag = sys.argv[2]
        amount = int(sys.argv[3])
        output = sys.argv[4]
        download(tag, amount, output)
    elif mode == 'c':  # clean data
        input = sys.argv[2]
        output = sys.argv[3]
        clean(input, output)
    elif mode == 'add':
        print 'not supported yet'
    else:
        print 'Not supported work mode'
except:
    print 'Unspecified error occurred ', sys.exc_info()[0], sys.exc_info()[1]

def summarize(text):
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize, sent_tokenize
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.cluster import KMeans
    import cleaner

    words = word_tokenize(text)
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words("english"))
    f = open("stopwords.txt")
    for stops in f.read().split():
        stop_words.add(stops)
    #print(sentences)

    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(sentences)
    true_k = 2
    model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
    model.fit(X)

    c1 = list()
    c2 = list()
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    for i in range(true_k):
        #print("Cluster %d:" % i)
        for ind in order_centroids[i, :10]:
            if i == 0:
                c1.append(terms[ind])
            else:
                c2.append(terms[ind])
    #print("Cluster 1 :")
    #print(c1)
    #print("Cluster 2 : ")
    #print(c2)

    sentence_score = {}
    sc = 1.0
    for sentence in sentences:
        sc = 1.0
        for word in c1:
            #print("\n* " + word)
            if word in sentence.lower():
                if sc <= 0:
                    sc = 0
                if sentence in sentence_score.keys():
                    sentence_score[sentence] += sc
                    sc = sc - 0.05
                    #print(sentence_score[sentence])
                else:
                    sentence_score[sentence] = sc
                    sc = sc - 0.05
                    #print(sentence_score[sentence])
    #print(sentence_score)

    sum_total = 0
    for sentence in sentences:
        if sentence in sentence_score.keys():
            sum_total += sentence_score[sentence]
    #print("Sum total : " + str(sum_total))
    average_score = int(sum_total / len(sentence_score))
    #print("Average = " + str(average_score))

    summary = ""
    # change the value to have more fun!
    for sentence in sentences:
        #print(sentence)
        if sentence in sentence_score.keys() and sentence_score[sentence] > 2.6 * average_score:
            summary += "" + cleaner.clean(sentence) + "\n\n"
    print(summary)

    sentence_score2 = {}
    for sentence in sentences:
        sc = 1.0
        for word in c2:
            #print("\n* " + word)
            if word in sentence.lower():
                if sc <= 0:
                    sc = 0
                if sentence in sentence_score2.keys():
                    sentence_score2[sentence] += sc
                    sc = sc - 0.05
                    #print(sentence_score2[sentence])
                else:
                    sentence_score2[sentence] = sc
                    sc = sc - 0.05
                    #print(sentence_score2[sentence])
    #print(sentence_score2)

    sum_total = 0
    for sentence in sentences:
        if sentence in sentence_score2.keys():
            sum_total += sentence_score2[sentence]
    #print("Sum total : " + str(sum_total))
    average_score = int(sum_total / len(sentence_score2))
    #print("Average = " + str(average_score))

    summary = ""
    # change the value to have more fun!
    for sentence in sentences:
        #print(sentence)
        if sentence in sentence_score2.keys() and sentence_score2[sentence] > 2.6 * average_score:
            summary += "" + cleaner.clean(sentence) + "\n\n\n"
    return (summary)

import twitter_stream
import cleaner

if __name__ == '__main__':
    # raw_data file path
    raw_data = 'data/twitter_data.json'

    while True:
        ## data retrieval
        twitter_stream.listen(raw_data)
        ## data cleaning
        european = cleaner.clean(raw_data)
        # insert
        cleaner.insert(european)
        ## erasing the file content
        f = open(raw_data, 'w')
        f.close()

from qrGenerator import generate
from qrGUI import show
from cleaner import clean
import easygui

print("SENDER starts working")
print("Generate QR codes")
generate(easygui.fileopenbox(default="../img/*"))
print("QR codes ready")
print("Sending data")
show()
clean()

    best_params = grid_result.best_params_
    print('best_params are:', best_params)
    rfr = RandomForestRegressor(max_depth=best_params["max_depth"], n_estimators=210,
                                random_state=False, verbose=False)
    # Perform K-Fold CV
    rfr.fit(X, y)
    #scores = cross_val_score(rfr, X, y, cv=10, scoring='neg_mean_absolute_error')
    #return [scores, rfr]
    return rfr


s = 'DataSets/Train.csv'
df = clean(s)
print(df.head())
Y = df['traffic_volume'].values
X = df.drop(['date_time', 'traffic_volume', 'dew_point'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

rfr = rfr_model(X, Y)
#print('scores are:', scores)
y_pred = rfr.predict(x_test)
err = mean_squared_error(y_test, y_pred)
err_log = mean_squared_log_error(y_test, y_pred)

data = pd.read_csv(path_training_data, header=None, names=cols, encoding='latin-1')
print('_______________________________________________________')
#print(len(data))
#print(data.head())

data.drop(['id', 'date', 'query_string', 'user'], axis=1, inplace=True)
X = data['text']
print(type(X))
print(X[:5])
y = data['sentiment']

clean_training_data = cleaner.clean(X)
#print('_______________________________________________________')
print(len(X))
print(y[:250])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.02, random_state=0)
print(99999999999999999999)

vectorizer = CountVectorizer(ngram_range=(1, 2))
training_features = vectorizer.fit_transform(X_train)
print(8888888888888888888888)

def main():
    # parse the files
    c = parse_css()
    h = parse_html()
    unused, undefined, fileNames, css = {}, {}, [], []
    results, fileCount = '', 0

    # identify UNUSED classes
    for cla, num in c[0][0].items():
        x = cla.split()
        # no pseudoclasses allowed
        if ':' not in x[0]:
            css.append(x[0])
        if x[0] not in h[0][0][0]:
            unused[cla] = num

    # identify UNUSED IDs
    for ID, num in c[1][0].items():
        y = ID.split()
        # no pseudoclasses allowed
        if ':' not in y[0]:
            css.append(y[0])
        if y[0] not in h[0][1][0]:
            unused[ID] = num

    i = f'Identified {c[0][1]} unique classes and {c[1][1]} unique IDs.\n'
    print('\n' + i)
    results += i

    # identify UNDEFINED classes and IDs
    for d in h[0]:
        for dd in d:
            for rule, file in dd.items():
                if rule not in css:
                    undefined[rule] = file
                    pre = 'ID: ' if rule[0] == '#' else 'class:'
                    o = f'Undefined {pre} {rule} : {file}'
                    print(o)
                    results += '\n' + o
    print()
    results += '\n'

    # copy to allow deleting
    final = dict(unused)

    # identify pseudoclasses
    for rule, num in unused.items():
        z = rule.split()
        r, fn = z[0], z[2]
        # get filenames
        if fn not in fileNames:
            fileNames.append(fn)
            fileCount += 1
        # if pseudoclass
        if ':' in r:
            rr = r.split(':')[0]
            # if rule exists and isn't unused
            if rr in css and rr + ' : ' + fn not in unused:
                del final[rule]
                continue
        o = ''
        if z[0][0] == '.':
            o = f'Unused class: {rule}{num}'
        elif z[0][0] == '#':
            o = f'Unused ID: {rule}{num}'
        print(o)
        results += '\n' + o

    # update dict with full css filepaths
    fullFilePairs = updateFilePaths(fileNames, h[1])

    # predefined in case file is already clean
    q, qq = 'no', 'no'

    if not final:
        o = 'No unused classes nor IDs!'
        print(o)
        results += o
    if final:
        # may i clean?
        q = input('\nMay I remove these unused rules and output new .css files? (yes/no): ')
        if q.lower() in ('yes', 'y'):
            clean(final, fileNames, fileCount)

    if not undefined:
        o = 'No undefined classes nor IDs!'
        print(o)
        results += o
    if undefined:
        # may i define?
        qq = input('May I add definitions for undefined rules? (yes/no): ')
        if qq.lower() in ('yes', 'y'):
            define(undefined, fullFilePairs)

    # no cleaning, but maybe a humble .txt file?
    if q.lower() in ('no', 'n') and qq.lower() in ('no', 'n'):
        qqq = input('Would you instead like a .txt file with your results? (yes/no): ')
        if qqq.lower() in ('yes', 'y'):
            with open('results.txt', 'w') as f:
                f.write(results)
            print('Wrote results.txt')
        elif qqq.lower() in ('no', 'n'):
            exit('Thank you.')
        else:
            exit('Invalid response.')
    else:
        exit('Thank you.')

        if s.query(News).filter(News.title == i['title'], News.author == i['author']).first():
            break
        else:
            s.add(News(**i))
    s.commit()
    redirect("/news")


@route('/recommendations')
def recommendations():
    s = session()
    rows = s.query(News).filter(News.label == None).all()
    classified_news = []
    for i in rows:
        prediction = model.predict(clean(i.title))
        for j in range(len(prediction)):
            if prediction[j] == 'good':
                classified_news.append(i)
            else:
                break
    return template('news_recommendations', rows=classified_news)


if __name__ == '__main__':
    s = session()
    rows = s.query(News).filter(News.label != None).all()
    X_train = [clean(row.title) for row in rows]
    y_train = [row.label for row in rows]
    model = NaiveBayesClassifier()
    model.fit(X_train, y_train)
    run(host="localhost", port=8080)

    }
}

TYPES = ["dev", "test", "train"]

if __name__ == '__main__':
    IN_DIR = sys.argv[1]
    OUT_DIR = sys.argv[2]
    data = {}

    try:
        os.mkdir(IN_DIR)
    except OSError:
        pass

    for t in TYPES:
        for country, dataset in DATASET_FILES.items():
            reader = Reader(IN_DIR + '/' + dataset[t])
            data[t] = list(zip(reader.y(), reader.X()))

        with open('{}/{}.tsv'.format(OUT_DIR, t), 'w') as out:
            csv_out = csv.writer(out, delimiter='\t')
            for row in data[t]:
                sentiment, text = row
                text = clean(text)
                if sentiment:
                    csv_out.writerow([sentiment, text])
                else:
                    csv_out.writerow([text])

all = True

if all:
    ratio = 25  # training-to-test-set ratio
    train_corpus = main_corpus[:(ratio * len(main_corpus) // (ratio + 1))]
    train_corpus_target = main_corpus_target[:(ratio * len(main_corpus) // (ratio + 1))]
    test_corpus = main_corpus[(len(main_corpus) - (len(main_corpus) // (ratio + 1))):]
    test_corpus_target = main_corpus_target[(len(main_corpus) - len(main_corpus) // (ratio + 1)):]
else:
    from cleaner import clean
    train_corpus = main_corpus
    train_corpus_target = main_corpus_target
    # "My name is Li Ming, applying for the sales manager position. Miss Wang asked me
    # to come for an interview at 2:30 this afternoon." / "You are one year older than him."
    test_corpus = clean(["我叫李明,应聘销售经理。是王小姐让我下午两点半来面试的。", "你比他大一岁"])
    test_corpus_target = [5, 2]

# size of datasets
train_corpus_size_mb = size_mb(train_corpus)
test_corpus_size_mb = size_mb(test_corpus)

print("%d documents - %0.3fMB (training set)" % (len(train_corpus_target), train_corpus_size_mb))
print("%d documents - %0.3fMB (test set)" % (len(test_corpus_target), test_corpus_size_mb))
print("%d categories" % len(my_categories))
print()
print("Extracting features from the training data using a sparse vectorizer...")

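# The slicing above implements a 25:1 train/test split by hand. A minimal
# sketch of the same split done with scikit-learn is shown below; it assumes
# main_corpus and main_corpus_target are parallel lists, and the variable
# names simply mirror the block above rather than any real module.
from sklearn.model_selection import train_test_split

train_corpus, test_corpus, train_corpus_target, test_corpus_target = train_test_split(
    main_corpus, main_corpus_target,
    test_size=1 / 26,   # roughly one part test for every 25 parts training
    shuffle=False,      # keep the original order, like the manual slicing
)
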