Example #1
class TopicModeler:
    def __init__(self, **kwargs):
        '''
        kwargs are passed straight through to gensim's LdaModel:

        gensim.models.ldamodel.LdaModel(corpus=None, num_topics=100, id2word=None, distributed=False, chunksize=2000, passes=1, update_every=1, alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001, minimum_probability=0.01, random_state=None, ns_conf=None, minimum_phi_value=0.01, per_word_topics=False, callbacks=None, dtype=numpy.float32)
        '''
        self.cleaner = Cleaner()
        self.lda_model = None
        self.lda_kwargs = kwargs
        self.dictionary = None

    def update(self, docs):
        cleaned = [list(self.cleaner.clean(doc)) for doc in docs]
        self.dictionary = corpora.Dictionary(cleaned)
        corpus = [self.dictionary.doc2bow(text) for text in cleaned]

        if self.lda_model is None:
            self.lda_model = models.ldamodel.LdaModel(corpus,
                                                      id2word=self.dictionary,
                                                      **self.lda_kwargs)
        else:
            self.lda_model.update(corpus, id2word=self.dictionary)

    def classify(self, doc):
        bow = self.dictionary.doc2bow(list(self.cleaner.clean(doc)))
        topic = max(self.lda_model.get_document_topics(bow),
                    key=lambda x: x[1])[0]
        return self.lda_model.show_topic(topic)

    def print_topics(self):
        print(self.lda_model.print_topics(num_topics=10, num_words=3))
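A hedged usage sketch (the documents, the kwargs, and the token stream produced by Cleaner.clean are assumptions, not part of the original):

# Hypothetical usage: constructor kwargs flow into gensim's LdaModel unchanged.
modeler = TopicModeler(num_topics=5, passes=10, random_state=42)
modeler.update([
    "the cat sat on the mat",           # assumed raw documents; Cleaner.clean()
    "dogs and cats make good pets",     # is expected to yield token streams
    "stock prices fell sharply today",
])
modeler.print_topics()
print(modeler.classify("my cat chased the dog"))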
Example #2
def clean_invalid_glyphs_and_remove_hinting(fontfile, hinting, output):
  whitespace_and_ignorable_list = get_whitespace_and_ignorable_list()
  cleaner = Cleaner(fontfile, hinting, whitespace_and_ignorable_list)
  cleaner.clean()
  # Flatten cmap format 4 (no idRangeOffset/glyphIdArray) so it is a simple
  # subset of format 12.
  change_method(_c_m_a_p.cmap_format_4, _cmap_format_4_compile, 'compile')
  cleaner.save(output)
  cleaner.close()
Example #3
def build(self):
    '''build the database'''
    reddit = Reddit()
    cleaner = Cleaner()
    for subreddit in reddit.get_subreddits():
        for post in reddit.get_posts(subreddit):
            self.database.insert(cleaner.clean(post))
            for comment in reddit.get_comments(post):
                self.database.insert(cleaner.clean(comment))
Example #4
def clean_invalid_glyphs_and_remove_hinting(fontfile, hinting, output, verbose):
    whitespace_and_ignorable_list = get_whitespace_and_ignorable_list()
    cleaner = Cleaner(fontfile, hinting, whitespace_and_ignorable_list)
    cleaner.clean(verbose)
    # Flatten cmap format 4 (no idRangeOffset/glyphIdArray) so it is a simple
    # subset of format 12.
    # do we still want this?
    change_method(_c_m_a_p.cmap_format_4, _cmap_format_4_compile, "compile")
    old_12_or_13_compile = change_method(_c_m_a_p.cmap_format_12_or_13, _cmap_format_12_or_13_compile, "compile")
    cleaner.save(output)
    cleaner.close()
    change_method(_c_m_a_p.cmap_format_12_or_13, old_12_or_13_compile, "compile")
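The compile swap above relies on change_method returning the previous implementation so the caller can restore it after save. A minimal sketch of that helper, assuming it simply swaps an attribute on the class (the real implementation may differ):

# Hypothetical sketch: install new_method as cls.<name> and hand back the
# old implementation so the caller can put it back later.
def change_method(cls, new_method, name):
    old_method = getattr(cls, name)
    setattr(cls, name, new_method)
    return old_method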
Example #5
def perform_coverage_delete(self, files):
    # perform coverage delete
    cleaner = Cleaner('pass # pragma: no cover\n')
    for filename in files.keys():
        content = self.read(filename)
        content = cleaner.clean(content, files[filename])
        self.write(filename, content)
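A hedged usage sketch, assuming files maps each filename to the zero-based line indices to blank out (matching the replacement-mode Cleaner exercised in Examples #11 and #13 below); deleter and the paths are hypothetical:

# Hypothetical call on the owning object: the listed lines of each file
# are replaced with 'pass # pragma: no cover' markers.
deleter.perform_coverage_delete({
    "pkg/module_a.py": [3, 4, 10],
    "pkg/module_b.py": [0],
})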
Example #6
def clean_invalid_glyphs_and_remove_hinting(fontfile, hinting, output,
                                            verbose):
    whitespace_and_ignorable_list = get_whitespace_and_ignorable_list()
    cleaner = Cleaner(fontfile, hinting, whitespace_and_ignorable_list)
    cleaner.clean(verbose)
    # Flatten cmap format 4 (no idRangeOffset/glyphIdArray) so it is a simple
    # subset of format 12.
    # do we still want this?
    change_method(_c_m_a_p.cmap_format_4, _cmap_format_4_compile, 'compile')
    old_12_or_13_compile = change_method(_c_m_a_p.cmap_format_12_or_13,
                                         _cmap_format_12_or_13_compile,
                                         'compile')
    cleaner.save(output)
    cleaner.close()
    change_method(_c_m_a_p.cmap_format_12_or_13, old_12_or_13_compile,
                  'compile')
Example #7
def main(endpoint_models: list):
    """ Loop over endpoint models, search file directory for endpoints
        and compare to a standard endpoint config.
    """
    audit_list = list()
    for model in tqdm(endpoint_models, desc="Looping over endpoint models..."):
        # first get the gold standard config file
        standard_config_file = get_standard_config(model)

        # now open that standard file
        with open(standard_config_file, 'r') as standard_config:
            standard_config_json = json.load(standard_config)

        audit = Parser(standard_config_json)

        # gather endpoint filenames
        endpoint_config_files = gather_endpoints(model)

        for endpoint in tqdm(endpoint_config_files,
                             desc="Looping over endpoint config files..."):
            with open(endpoint, 'r') as endpoint_file:
                endpoint_json = json.load(endpoint_file)

            config_diff = audit.compare(endpoint_json, endpoint.name)

            cleaner = Cleaner(config_diff)
            cleaned = cleaner.clean()
            audit_list.append({f"{model}": cleaned})
    return audit_list
Example #8
def clean_corpus(self, sentences):
    wordlist = []
    for sent in sentences:
        cleaned = Cleaner.clean(sent)
        words = cleaned.split()
        wordlist.extend(words)
    corpus = " ".join(wordlist)
    return corpus
Example #9
def test_to_submission_format(self):
    e = Cleaner('./tests/data/models/ae3_213750_model.pkl')
    img = '../data/test/10.png'
    img, id = e.clean(img)
    csv = e.to_submission_format(img, id)
    row = csv[300].split(',')
    self.assertEqual(row[0], '%s_%d_%d' % (id, 1, 301))
    self.assertTrue(float(row[1]) <= 1.0)
Example #10
def do(task):
    logging.debug("Start doing task: %s" % task)
    cleaner = Cleaner()
    try:
        return cleaner.clean(task)
    except Exception:
        traceback.print_exc(file=sys.stderr)
        logging.critical('Failed while cleaning for task %s' % (task['ID']))
        return False
Example #11
class CleanerTest(unittest.TestCase):
    def setUp(self):
        self.uut = Cleaner()

    def test_delete_nothing(self):
        lines = ['a', 'b', 'c', 'd', 'e']
        self.assertEqual(self.uut.clean(lines, []), lines)

    def test_delete_one_line(self):
        lines = ['a', 'b', 'c', 'd', 'e']
        self.assertEqual(self.uut.clean(lines, [0]), ['b', 'c', 'd', 'e'])

    def test_delete_multiple_line(self):
        lines = ['a', 'b', 'c', 'd', 'e']
        self.assertEqual(self.uut.clean(lines, [0, 2, 3]), ['b', 'e'])

    def test_delete_line_that_does_not_exist(self):
        lines = ['a', 'b', 'c', 'd', 'e']
        self.assertEqual(self.uut.clean(lines, [10, 20, 30]), lines)
Example #12
class LinuxBackup:
    def __init__(self):
        self.config_parser = ConfigParser()
        self.out_dir = os.path.join(os.path.dirname(__file__), "backups/")
        self.create_empty_dir()
        self.backup_name = "%s-%s.zip" % (os.getlogin(), time.strftime("%d-%m-%Y"))
        self.directories = self.config_parser.directories_to_backup
        self.path = os.path.join(self.out_dir, self.backup_name)
        self.zip_creator = ZipCreator(self.path, self.directories)
        self.drive_connector = DriveConnector(self.out_dir, self.config_parser)
        self.cleaner = Cleaner(self.out_dir, self.config_parser.get_clean_time())

    def create_empty_dir(self):
        if not os.path.exists(self.out_dir):
            os.mkdir(self.out_dir)

    def backup(self):
        self.zip_creator.create()
        self.drive_connector.upload(self.path)
        self.cleaner.clean()
Example #13
class CleanerWithReplacementTest(unittest.TestCase):
    def setUp(self):
        self.uut = Cleaner('pass')

    def test_delete_nothing(self):
        lines = ['a', 'b', 'c', 'd', 'e']
        self.assertEqual(self.uut.clean(lines, []), lines)

    def test_replace_one_line(self):
        lines = ['a', 'b', 'c', 'd', 'e']
        self.assertEqual(self.uut.clean(lines, [0]),
                         ['pass', 'b', 'c', 'd', 'e'])

    def test_replacing_respects_indentation(self):
        lines = ['    a', 'b', '  c', 'd', 'e']
        self.assertEqual(self.uut.clean(lines, [0, 2]),
                         ['    pass', 'b', '  pass', 'd', 'e'])

    def test_collapses_two_pass_together(self):
        lines = ['a', 'b', 'c', 'd', 'e']
        self.assertEqual(self.uut.clean(lines, [0, 1]),
                         ['pass', 'c', 'd', 'e'])
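Taken together, Examples #11 and #13 pin down the Cleaner contract: drop the listed line indices, or, when a replacement string was given, substitute it with the original line's indentation and collapse consecutive replacements into one. A minimal sketch that satisfies these tests (an assumption, not the project's actual implementation):

# Hypothetical Cleaner consistent with the two test cases above.
class Cleaner:
    def __init__(self, replacement=None):
        self.replacement = replacement

    def clean(self, lines, indices):
        result = []
        for i, line in enumerate(lines):
            if i not in indices:
                result.append(line)
                continue
            if self.replacement is None:
                continue  # plain deletion mode: drop the line
            indent = line[:len(line) - len(line.lstrip())]
            replaced = indent + self.replacement
            # collapse two adjacent replacements into a single one
            if not result or result[-1] != replaced:
                result.append(replaced)
        return result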
Example #14
def clean_comment(self, comment):
    return Cleaner.clean(comment)  # TODO: move this to self
Example #15
from os import environ as environment
import argparse, yaml
import logging
from cleaner import Cleaner

parser = argparse.ArgumentParser()
parser.add_argument("--path", help="path to run cleaner on", type=str)
args = parser.parse_args()

# logging.basicConfig(level=logging.DEBUG)


with open("config.yml") as sets:
    config = yaml.load(sets)

path = args.path
if not path:
	path = config["cleaner"]["general_pattern"]

cleaner = Cleaner(config["cleaner"])

print "Cleaning path: " + str(path)
cleaner.clean(path, True)
Example #16
def main(p):
    start = time.time()

    # Select only files whose names end with 'json.gz'
    file_name_list = filter(lambda x: x.endswith('json.gz'), os.listdir(p))

    # TODO: check whether there are exactly 24 files (glob module)

    for file_name in file_name_list:
        with open(os.path.join(p, file_name), 'r') as f:
            raw_json_file = gzip.GzipFile(fileobj=f)

            record_cleaner = Cleaner()
            record_grouper = Grouper(db)
            record_normalizer = Normalizer(db)
            mongo_helper = MongoHelper(db)
            counter = ActorCounter()
            evaluater = Evaluater()

            # Clean the data
            record_cleaner.set_dirty_data(raw_json_file)
            record_cleaner.clean()
            clean_record = record_cleaner.get_clean_data()
            log.log('clean record %s' % len(clean_record))
            # Process the data

            # Group the records
            record_grouper.set_records(clean_record)
            record_grouper.group()
            record_actor_exist = record_grouper.get_group_1()
            record_actor_new = record_grouper.get_group_2()
            log.log('record_actor_exist: %s' % len(record_actor_exist))
            log.log('record_actor_new: %s' % len(record_actor_new))

            # Handle records whose actor already exists
            log.log('Begin processing actor-exist records...')
            # Just drop each record's actor_attributes
            for record in record_actor_exist:
                del record['actor_attributes']
            log.log('Finished.')

            # Handle records whose actor does not exist yet
            record_normalizer.set_records(record_actor_new)
            record_normalizer.normalize()
            record_actor_new = record_normalizer.get_record_actor_new()
            new_actors = record_normalizer.get_new_actors()

            # Push today's locally added actors to the database
            actors = new_actors.values()
            mongo_helper.insert_new_actors(actors)

            # For each new actor, update the corresponding count in Redis
            counter.count_actor_list(actors)

            # Compute each record's val
            evaluater.set_records(record_actor_exist)
            evaluater.evaluate()
            val_actor_exist = evaluater.get_val_cache()

            evaluater.set_records(record_actor_new)
            evaluater.evaluate()
            val_actor_new = evaluater.get_val_cache()

            # Insert the records into the database
            mongo_helper.insert_new_reocrds(record_actor_new)
            mongo_helper.insert_new_reocrds(record_actor_exist)

            # Push today's new per-user val to the database
            mongo_helper.update_val(val_actor_new)
            mongo_helper.update_val(val_actor_exist)

            record_cleaner.free_mem()
            del record_cleaner
            del record_grouper
            del record_normalizer
            del mongo_helper
            del counter
            del evaluater

    # Generate the CSV file
    util.grcount2csv()

    end = time.time()
    log.log('total: %s s' % (end - start))
Example #17
def main():
    config = create_config()
    args = parse_args()
    cleaner = Cleaner(config, args)
    cleaner.clean()
Example #18
import time

t1 = time.time()

fd = FeedDownloader("http://mybroadband.co.za/news/feed", "My Broadband")
wh = WebHelper()
cleaner = Cleaner()

articles = fd.parse()

print time.time() - t1, "setup complete"
t1 = time.time()

wh.get_html_threaded(articles)
print time.time() - t1, "threaded download complete"

t1 = time.time()
for a in articles:
    a.html = wh.attempt_get_html(a.url)
print time.time() - t1, "non threaded download complete"

t1 = time.time()
for a in articles:
    if a.html:
        a.plaintext = cleaner.clean(a.html)
        print(a.plaintext[:300])
        print('')
print time.time() - t1, "cleaning complete"

Example #19
    def vocabulary_analysis(self, cleaned_corpus, text):
        len_wrds = len(text.split())
        emos = []

        balance = {}
        for all_keys in self.meta_data.keys():
            balance[all_keys] = []

        text = text.lower()
        words = text.split()
        covered = []
        for word in self.meta_keywords:
            if " " + word + " " in " " + text + " ":
                for tag in self.meta_keywords[word]:
                    if word not in balance[tag]:
                        actual_count = self.count_occurrences(word, text)
                        for i in range(actual_count):
                            balance[tag].append(word)

                            if word in self.both:
                                emos.append(word)

                        covered.append(word)

            else:
                cln_wrd = Cleaner.clean(word)
                if cln_wrd != "" and cleaned_corpus != "" and " " + cln_wrd + " " in " " + cleaned_corpus + " ":
                    for tag in self.meta_keywords[word]:
                        if word not in balance[tag] and word.lower().strip(
                        ) in text.lower().strip():
                            actual_count = self.count_occurrences(
                                cln_wrd, cleaned_corpus)
                            for i in range(actual_count):
                                balance[tag].append(word)

                                if word in self.both:
                                    emos.append(word)

                            covered.append(word)

        visited = []
        for key, values in balance.items():
            vals = Counter(values)
            vals = sorted(vals.items(),
                          key=operator.itemgetter(1),
                          reverse=True)

            for ww in vals:
                if ww[0] not in visited:
                    visited.append(ww[0])

            cnts = round((float(sum([x[1] for x in vals]) * 100) / len_wrds),
                         2)

            balance[key] = {}
            value_words = [
                "".join(a for a in x[0] if a not in punctuation) for x in vals
            ]
            balance[key]['values'] = list(set(value_words))
            balance[key]['percentage'] = cnts
            balance[key][key + '_count'] = sum([x[1] for x in vals])

        word_balance_grade, score = self.getWordBalanceGrade(balance)
        balance['grade'] = word_balance_grade
        balance['score'] = score

        other = []
        cln_words = cleaned_corpus.split()
        for wd in words:
            wd = "".join(a for a in wd if a not in punctuation)
            if wd and wd not in covered:
                other.append(wd)
        # print other
        other_per = float(len(other)) * 100 / len_wrds

        # other_per = 100 - balance['common']['percentage'] - balance['uncommon']['percentage'] - balance['emotional']['percentage'] - balance['power']['percentage']
        # if other_per < 0:
        # other_per = 0

        balance['other'] = {}
        balance['other']['percentage'] = other_per
        balance['other']['values'] = other

        balance['common']['common_count'] = balance['common']['percentage']

        # if emos:
        # 	balance['emotional']['values'].extend(emos)
        # 	balance['emotional']['emotional_count'] += len(emos)

        return balance
Example #20
    def extract(raw_txt, logger):

        c = Cleaner()
        cleaned_text_list = c.clean(raw_txt)

        logger.info('Done cleaning')
        logger.debug(len(cleaned_text_list))
        logger.debug(cleaned_text_list)

        matrix_builder = MatrixBuilder()
        matrix = matrix_builder.build_sim_matrix(cleaned_text_list, logger)

        logger.info('Done building sim matrix')
        logger.debug('Dimensions: {}'.format(matrix.shape))
        logger.debug(matrix)

        g = Grapher()
        pageranks = g.graph(matrix)

        logger.info('Generated graph and got pageranks')
        logger.debug(pageranks)

        total_doc_size = len(cleaned_text_list)
        # choose summary length as a size-dependent fraction of the document
        if total_doc_size <= 300:
            summary_length = int(0.4 * total_doc_size)
        elif total_doc_size <= 800:
            summary_length = int(0.2 * total_doc_size)
        elif total_doc_size <= 1500:
            summary_length = int(0.1 * total_doc_size)
        else:
            summary_length = int(0.05 * total_doc_size)

        top_ranked = nlargest(summary_length, pageranks, key=pageranks.get)
        top_ranked.sort()

        cl = Cluster()
        top_ranked = cl.splitIntoParagraph(top_ranked, 7.5)

        logger.debug(top_ranked)
        result = ''
        for paragraph in top_ranked:
            for key in paragraph:
                top_ranked_sentence = cleaned_text_list[key]
                result += '{}. '.format(top_ranked_sentence)
            result += '\n\n'

        try:
            del c
            del cleaned_text_list
            del matrix_builder
            del matrix
            del g
            del pageranks
            del total_doc_size
            del summary_length
            del top_ranked
            del cl
            del raw_txt
        except:
            pass

        return result
Example #21
def main():
    parser = OptionParser(prog="reaper",
                          version="0.1.0",
                          usage="%prog [options] <path to folder> "+
                          "[<path to folder...>]",
                          description="PyReaper is a small tool that detects " + 
                          "duplicated files by hashing them and then deletes " + 
                          "these duplicated files leaving just one of them", 
                          epilog="CAUTION: handle with EXTREME CARE, " + 
                          "use -n option first if you are not sure of " + 
                          "what are you doing, this thing deletes stuff!!!")
    parser.add_option("-n",
                      "--no-action",
                      dest="noaction",
                      action="store_true",
                      help="does not executes any file action")
    parser.add_option("-d",
                      "--delete",
                      dest="delete",
                      action="store_true",
                      help="delete every duplicated file")
    parser.add_option("-m", 
                      "--move-to",
                      dest="moveto",
                      metavar="DIR",
                      help='Moves duplicated files instead of deleting them')
    parser.add_option("-p",
                      "--print-rm-commands",
                      dest="rmcommands",
                      action="store_true",
                      help="skips delete process and prints a set of \"rm\" " + 
                      "commands so you can delete the duplicate files yourself")
    parser.add_option("-i",
                      "--interactive",
                      dest="interactive",
                      action="store_true",
                      help="interactive mode, will ask for each duplicate. " + 
                      "By default it deletes every duplicate found but " + 
                      "the first one")
    parser.add_option("-y",
                      "--dont-ask-confirmation",
                      dest="noconfirmation",
                      action="store_true",
                      help="skips confirmation question. ")
    parser.add_option("-s",
                      "--store-hashes",
                      dest="storehash",
                      action="store_true",
                      help="store and keep calculated hashes in .digest hidden files ")
    parser.add_option("-t",
                      "--delete-empty-trees",
                      dest="deletedirs",
                      action="store_true",
                      help="deletes empty trees when finishes")
    parser.add_option("-e", 
                      "--ext", 
                      dest="extension", 
                      action="store",
                      help="only digests files with the given extension" )
    parser.add_option("-v",
                      "--verbose",
                      dest="verbose",
                      action="store_true",
                      help="outputs much more information during process " + 
                      "(sometimes even too much)")
    parser.add_option("",
                      "--ignore-stored-hashes",
                      dest="ignorehashes",
                      action="store_true",
                      help="ignores stored calculated hashes in .digest " + 
                      "hidden files, this means every hash will be " + 
                      "recalculated")
    
    (options, args) = parser.parse_args()

    if not args:
        exit_with_error('', parser)
    
    br = Walker(options.extension,
                options.storehash,
                options.verbose,
                options.ignorehashes)

    action = None
    moveto = None
    rmcommands = False
    
    if options.noaction:
        action = 'n'
        
    elif options.moveto:
        action = 'm'
        moveto = options.moveto
        
        if not moveto:
            exit_with_error('No "move to" target provided', parser)
            
        elif not os.path.exists(moveto):
            exit_with_error('Path %s does not exist' % moveto, parser)
            
        elif not os.path.isdir(moveto):
            exit_with_error('Path %s is not a directory' % moveto, parser)
        
    elif options.delete:
        action = 'd'
        rmcommands = options.rmcommands
        
        
    if action is None:
        exit_with_error('No action selected', parser)

    for path in args:
        if not os.path.exists(path):
            exit_with_error("path {0} does not exists".format(path), parser)
        br.digest(path)
    
    duplicates = br.collisions()
    clean = False
    
    if duplicates:

        print "Duplicates found, cleaning..."
        c = Cleaner(
                    duplicates,
                    options.interactive,
                    options.verbose,
                    action,
                    rmcommands,
                    options.noconfirmation,
                    moveto)

        clean = c.clean()
        
    else:
        print "No duplicates found"
        
    if not options.storehash:
        print "Deleting digest files..."
        c = Cleaner(verbose = options.verbose)
        c.delete(br.digestFiles(), -1, True)
        
        
    if options.deletedirs:
        c = Cleaner(verbose=options.verbose)
        for path in args:
            empty_dirs = br.findEmptyDirs(path)
            for dir in empty_dirs:
                if options.rmcommands or options.noaction:
                    print "Keeping empty tree {0}".format(dir)
                else:
                    c.deleteDir(dir)
        
    if clean:
        sys.exit(0)
    else:
        sys.exit(1)
Example #23
import json
from finder_interface import FinderInterface
from cleaner import Cleaner
from writer_interface import WriterInterface
from cli_messages import Message
''' Get the config object from config/config.txt '''
with open("config/config.txt", "r") as json_file:
    config = json.load(json_file)
''' Get the word from the wikipedia page, pass term from config '''
wiki_finder = FinderInterface(config["wiki_term"])
word = wiki_finder.get_word()
''' Clean the word of special characters and signs '''
cleaner = Cleaner()
clean_word = cleaner.clean(word)
''' Save the word and the validating word in the file / database '''
writer = WriterInterface()
writer.save_word(clean_word)
''' Start the game here '''
echo = input("Are you ready to play? ")
print(" ")
name = input("What is your name darling? ")
message = Message(name)

if echo == "yes":
    # Choose the level
    level = input("What is your LEVEL (easy/medium/hard)? ")
    if level == "easy":
        max_turns = len(clean_word) + config["levels"]["easy"]
    elif level == "medium":
        max_turns = len(clean_word) + config["levels"]["medium"]
    elif level == "hard":
Example #24
def collect():
    storage = Storage('constituents.db')
    try:
        result = storage.run_query('Select * from constituents')
    except Exception:
        ## If the DAX constituents table isn't present, fetch it by calling another script.
        collect_constituent()
        result = storage.run_query('Select * from constituents')

    df = pd.DataFrame(result)
    df.columns = ['constituent_name','wkn']
    url = 'https://www.boerse-frankfurt.de/equity/{}?lang=en'
    options = webdriver.ChromeOptions()
    # options.add_argument('--headless')
    driver = webdriver.Chrome('../chromedriver', options=options)
    final_data = {}

    try:
        for index,row in df.iterrows():
            ## To loop over each constituent page.
            final_data = {}
            driver.get(url.format(row['wkn']))
            sleep(3)
            tabs = driver.find_elements_by_xpath('//button[contains(@class,"data-menue-button")]')
            for tab in tabs:
                if tab.text in ['Charts','News','Company Details']:
                    continue
                tab.click()
                sleep(4)
                tables = driver.find_elements_by_xpath('//table')
                table_names = []
                for table in tables:
                    try:
                        table_name = table.find_element_by_xpath('./..//preceding-sibling::h2[contains(@class,"widget-table-headline")]').text
                    except Exception:
                        table_name = ''
                    if table_name.find(row['constituent_name']) != -1:
                        table_name = table_name[ :table_name.find(row['constituent_name'])].strip()
                    table_names.append(table_name)
                data = pd.read_html(driver.page_source)
                for each_df,table_name in zip(data,table_names):
                    if not table_name:
                        continue
                    final_data[table_name] = each_df
            ## Call cleaner to cleanse and format data.
            cleaner = Cleaner(final_data)
            final_data = cleaner.clean()
            collection_date = datetime.datetime.now().strftime('%d/%m/%y')
            for table in final_data:
                ## Get the dataframe and filter out rows that are already present in database.
                latest_date = storage.get_date(table,row['wkn'])
                if not latest_date or collection_date > latest_date:
                    print('Collecting data for {} for constituent {}({})'.format(table,row['constituent_name'],row['wkn']))

                    try:
                        final_data[table]['collection_date'] = collection_date
                        final_data[table]['constituent_name'] = row['constituent_name']
                        final_data[table]['wkn'] = row['wkn']
                        ## Insert the data to database.
                        storage.insert_bulk(table,final_data[table])
                    except Exception as e:
                        print(e)
                else:
                    print('Already collected for {} for constituent {}({})'.format(table,row['constituent_name'],row['wkn']))
            break
    except Exception as e:
        print(e)
    finally:
        driver.quit()
Example #26
"""
from matplotlib import pyplot as plt
import numpy as np
import cv2
from loader import Loader
from cleaner import Cleaner
from features import Features
import sys
from params import Params

if __name__ == '__main__':
    params = Params()
    params.decode(sys.argv[1:])

    loader = Loader()
    loader.loadReferenceAndTentative()
    cleaner = Cleaner()
    target, imgReference = cleaner.clean(loader.imgGrayReference, "reference")
    imgTentative = cleaner.cleanWithExpectedCount(loader.imgGrayTentative,
                                                  "tentative", target)

    features = Features(imgReference, loader.imgColorReference, imgTentative,
                        loader.imgColorTentative)
    features.extractFeatures(params)

    print("Offset is:" + str(features.offset) + ", angle is: " +
          str(features.angle) + ", value is:" + str(features.value) +
          ", scale is:" + str(features.scaleFactor))
    sys.exit()
Example #27
def use_cleaner(html_snippet):
    cleaner = Cleaner(html_snippet)

    cleaner.clean()

    return str(cleaner)
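This variant implies a Cleaner that mutates its own buffer in place and renders the result via __str__. A hypothetical shape for such a class (the actual cleaning logic is an assumption):

# Hypothetical Cleaner with the interface use_cleaner expects.
class Cleaner:
    def __init__(self, html_snippet):
        self._html = html_snippet

    def clean(self):
        # placeholder cleaning step: collapse runs of whitespace
        self._html = " ".join(self._html.split())

    def __str__(self):
        return self._html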
Example #28
def clean_invalid_glyphs_and_remove_hinting(fontfile, hinting, output):
  whitespace_list = get_whitespace_list()
  cleaner = Cleaner(fontfile, hinting, whitespace_list)
  cleaner.clean()
  cleaner.save(output)
  cleaner.close()