示例#1
0
def avg_comments():
    """Draw a bar chart of the average number of comments per news source.

    The per-source averages are hard-coded below.  Bars are ordered by
    ascending average and each bar is annotated with its value rounded to
    two decimals.  The figure is saved once for every format listed under
    the ``tex_image_format`` config key and is then shown.
    """
    width = 0.8
    data = {
        "bk55": 8.5272094345,
        "gorod55": 10.6726493011,
        "ngs55": 9.88509836752,
        "omskinform": 2.71181391085
    }

    # Sort by average so the bars grow from left to right.
    ordered = sorted(data.items(), key=lambda pair: pair[1])
    # The tick labels must come from the *sorted* pairs; taking them from
    # the dict keys would attach source names to the wrong bars.
    sources = [source for source, _ in ordered]
    numbers = [round(avg, 2) for _, avg in ordered]
    sources_x = np.arange(len(ordered))

    fig, ax = plt.subplots()
    ax.set_xlabel(u'Источник')
    ax.set_ylabel(u'Среднее количество комментариев')
    rects = ax.bar(sources_x, numbers, width)
    # Shift ticks by half the bar width (0.8 / 2).
    plt.xticks(sources_x + 0.4, sources)

    # Write each bar's value just above it.
    for rect in rects:
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width()/2, height, height, ha='center', va='bottom')

    for ext in config.get("tex_image_format"):
        plt.savefig(config.get("tex_image_path") + "avg_comments." + ext, bbox_inches='tight', format=ext, dpi=1200)
    plt.show()
示例#2
0
def merge_dicts(in_dicts_list, out_dict):
    """Merge several tab-separated sentiment dictionaries into one file.

    Each input line is ``word<TAB>score``.  When a word occurs more than
    once, the first occurrence (in input order) wins.  The merged entries
    are written sorted by word.

    :param in_dicts_list: file names of the input dictionaries, relative to
        ``<home_path>/Thesis/senti/dict/``.
    :param out_dict: file name of the merged dictionary in the same directory.
    """
    # Resolve the output path up front: it does not depend on the inputs, and
    # binding it inside the loop left it undefined for an empty input list.
    path_out = "{0}/Thesis/senti/dict/{1}".format(config.get("home_path"), out_dict)

    seen = set()
    merged = []
    for dict_name in in_dicts_list:
        path_in = "{0}/Thesis/senti/dict/{1}".format(config.get("home_path"), dict_name)

        with open(path_in, 'r') as dict_file:
            for num, line in enumerate(dict_file):
                # Drop a UTF-8 byte order mark if the line starts with one.
                if line.startswith(codecs.BOM_UTF8):
                    line = line[len(codecs.BOM_UTF8):]
                line = line.strip()
                print("{0} {1}".format(num, line))
                fields = line.split('\t')
                word = fields[0].strip().decode("utf8")
                score = int(fields[1])
                # Remove duplicates: keep only the first entry for a word.
                if word not in seen:
                    seen.add(word)
                    merged.append((word, score))

    merged.sort(key=lambda entry: entry[0])

    with open(path_out, 'w') as dict_out_file:
        for word, score in merged:
            dict_out_file.write((word + "\t" + str(score) + "\n").encode('utf-8'))
示例#3
0
def mallet_plot():
    """Plot hard-coded Mallet perplexity values against the number of topics.

    The figure is saved as ``perplexity_mallet.<ext>`` for every format
    listed under the ``tex_image_format`` config key.
    """
    # Topic counts 5, 10, ..., 100, one per perplexity value below.
    topic_counts = list(range(5, 101, 5))
    perplexities = [
        514.1302184469, 525.0241590983, 534.4977260582, 538.5513127672,
        541.8989344152, 541.0094559616, 538.4206752508, 535.3505225192,
        532.7629698852, 533.9200808978, 540.3049166531, 531.4757292891,
        538.0214941918, 546.4364835315, 538.0214941918, 540.9982061032,
        532.1429352736, 535.5732148468, 536.4983783528, 539.0330790911,
    ]

    plt.plot(topic_counts, perplexities, 'o-')
    plt.xlabel(u'Количество тем')
    plt.ylabel(u'Перплексия')
    plt.xlim(0, 105)
    plt.grid(True)

    for ext in config.get("tex_image_format"):
        target = config.get("tex_image_path") + "perplexity_mallet." + ext
        plt.savefig(target, format=ext, bbox_inches='tight', dpi=1200)
示例#4
0
def mongo_factory():
    """
    Build a mongo client from the settings.

    Reads ``mongo.hosts`` (comma-separated, default ``localhost:27017``),
    ``mongo.database`` (default ``test``), ``mongo.user`` and
    ``mongo.password`` (both default to an empty string) and passes them to
    ``mongo_client``, whose result is returned.
    """
    from settings import config

    host_spec = config.get('mongo.hosts', 'localhost:27017')
    db_name = config.get('mongo.database', 'test')
    username = config.get('mongo.user', '')
    secret = config.get('mongo.password', '')

    # The hosts setting is a single comma-separated string.
    host_list = host_spec.split(',')
    return mongo_client(host_list, db_name, username, secret)
示例#5
0
def file_every_1000():
    """Export the collection to XML files of at most 1000 documents each.

    Every file holds one ``<corpus>`` element with a ``<document>`` child per
    database document and is written to
    ``<home_path>/Thesis/omsk_media_<begin>-<end>.xml``, where ``begin`` is
    the index of the first exported document and ``end`` is one past the
    last.
    """
    chunk_size = 1000
    # Count once instead of issuing a count query for every chunk.
    total = db.find().count()

    for begin in range(0, total, chunk_size):
        # ``end`` is exclusive.  Clamping to ``total`` (not ``total - 1``)
        # keeps the last document in the final chunk.
        end = min(begin + chunk_size, total)
        print("begin, end  {0} {1}".format(begin, end))

        corpus = etree.Element("corpus")
        for doc in db.find()[begin:end]:
            document = etree.SubElement(corpus, "document")
            etree.SubElement(document, "title").text = doc["title"]
            etree.SubElement(document, "content").text = doc["content"]
            etree.SubElement(document, "date").text = str(doc["date"])
            etree.SubElement(document, "url").text = doc["url"]
            etree.SubElement(document, "source").text = doc["source"]
            comments = etree.SubElement(document, "comments", amount=str(len(doc["comments"])))

            for comment in doc["comments"]:
                etree.SubElement(comments, "comment").text = comment

        xml_str = etree.tostring(corpus, pretty_print=True)

        with open("{0}/Thesis/omsk_media_{1}-{2}.xml".format(config.get("home_path"), begin, end), 'w') as outFile:
            outFile.write(xml_str)
示例#6
0
 def update_price_history(self, c, timestamp, price):
     """Append a price sample to the live history of ``c`` and store it.

     ``c['history_live']`` loses its oldest entry first when it already
     holds ``history_live_window`` (config) entries or more.  The new entry
     carries the time of day formatted as ``HH:MM:SS``; the record passed to
     ``self.store`` carries the raw ``timestamp``.  Returns ``c``.
     """
     monitored_at = datetime.datetime.fromtimestamp(timestamp).strftime('%H:%M:%S')

     history = c['history_live']
     if len(history) >= config.get('history_live_window'):
         # Drop the oldest sample; the slice creates a new list.
         history = history[1:]
         c['history_live'] = history
     history.append({'timestamp': monitored_at, 'price': price})

     record = {'stock': c.get('stock'), 'timestamp': timestamp, 'price': price}
     self.store(record)
     return c
示例#7
0
文件: widgets.py 项目: 0x64746b/alot
 def __init__(self, tag):
     """Create the widget for ``tag``.

     The displayed text is the ``tag-translate`` config entry for the tag,
     falling back to the tag itself; it is UTF-8 encoded and clipped.
     The normal and focus attributes come from ``config.get_tagattr``.
     """
     self.tag = tag
     self.translated = config.get('tag-translate', tag, fallback=tag)
     encoded_label = self.translated.encode('utf-8')
     self.txt = urwid.Text(encoded_label, wrap='clip')
     attr_normal = config.get_tagattr(tag)
     attr_focus = config.get_tagattr(tag, focus=True)
     urwid.AttrMap.__init__(self, self.txt, attr_normal, attr_focus)
示例#8
0
文件: message.py 项目: jhcepas/alot
 def get_datestring(self):
     """Return the formatted date string.

     Uses the ``timestamp_format`` option of the ``general`` config section
     when it is set, otherwise ``helper.pretty_datetime``.
     """
     formatstring = config.get('general', 'timestamp_format')
     if not formatstring:
         return helper.pretty_datetime(self._datetime)
     return self._datetime.strftime(formatstring)
示例#9
0
def expand_words(path_in, path_out):
    """Rewrite an emotion lookup table, expanding or stemming its words.

    Both arguments are dictionary directory names under
    ``<home_path>/Thesis/senti/dict/``; the file processed in each is
    ``EmotionLookupTable.txt``.  Every input line is ``word<TAB>score``.

    For each word:

    * words that already contain ``*`` are copied unchanged;
    * if the Snowball stem is shorter than 4 characters, the word is replaced
      by all of its word forms (``get_all_forms``), except for the words in
      ``exclude_words``, which are copied unchanged;
    * otherwise, if stemming shortened the word, ``<stem>*`` is written;
    * otherwise the word is copied unchanged.

    :param path_in: directory name of the source dictionary.
    :param path_out: directory name of the destination dictionary.
    """
    path_in = "{0}/Thesis/senti/dict/{1}/EmotionLookupTable.txt".format(config.get("home_path"), path_in)
    path_out = "{0}/Thesis/senti/dict/{1}/EmotionLookupTable.txt".format(config.get("home_path"), path_out)

    # Words that are never expanded into their word forms.
    exclude_words = [u"зря"]
    # Stems shorter than this are too short to be used as wildcard rules.
    min_stem_length = 4

    # Opening with 'w' already empties the output file.
    with open(path_in, "r") as infile, open(path_out, 'w') as outfile:
        for line in infile:
            # http://en.wikipedia.org/wiki/Byte_order_mark#UTF-8
            if line.startswith(codecs.BOM_UTF8):
                line = line[len(codecs.BOM_UTF8):]
            fields = line.strip().split('\t')
            word = fields[0].strip().decode("utf8")
            score = int(fields[1])

            if "*" in word:
                # Already a wildcard rule.
                entries = [word]
            else:
                stemmed = stem_snowball(word)
                if len(stemmed) < min_stem_length:
                    # Too short for a rule: list every word form instead.
                    entries = [word] if word in exclude_words else get_all_forms(word)
                elif len(stemmed) < len(word):
                    # Stemming removed a suffix: replace it with "*".
                    entries = [stemmed + "*"]
                else:
                    # Stemming changed nothing: keep the word as it was.
                    entries = [word]

            for entry in entries:
                outfile.write(entry.encode('utf-8') + "\t" + str(score) + "\n")
示例#10
0
 def __new__(cls, *arguments, **keywords):
     '''
     Create an instance of the Exporter subclass named in the config file
     (General -> exporter).

     The configured value is compared with ``<module>.<class name>`` of
     every direct subclass of Exporter.  Raises Exception, listing the
     available subclass names, when none of them matches.
     '''
     wanted = config.get('General', 'exporter')
     candidates = Exporter.__subclasses__()
     for subclass in candidates:
         class_str = subclass.__module__ + "." + subclass.__name__
         if class_str == wanted:
             return super(cls, subclass).__new__(subclass)
     # Call form of raise: valid on Python 2 and 3 alike.
     raise Exception('Invalid Exporter! Use one of: %s' % ([c.__name__ for c in candidates]))
示例#11
0
def comments_count_by_day_complete():
    """Plot the per-day comment counts and highlight weekends.

    The data comes from ``comments_count_by_day()``; the value plotted for a
    date is element ``[1]`` of that date's entry.  Saturday/Sunday pairs are
    shaded.  The figure is saved as ``comments_by_day.<ext>`` for every
    configured image format and then shown.  Finally the weekend and weekday
    values are printed, each followed by its sum and count.
    """
    # Fetch the data once and reuse it for the line plot and the shading.
    date_dict = comments_count_by_day()
    one_day = datetime.timedelta(1)

    weekend_comments = []
    weekday_comments = []

    series = sorted(date_dict.items(), key=lambda item: item[0])
    series = [(date, scores[1]) for date, scores in series]

    fig, ax = plt.subplots()
    ax.plot(*zip(*series))
    for i, single_date in enumerate(daterange()):
        print(i)
        day_of_week = single_date.weekday()
        if day_of_week == 5:
            # Saturday: shade the span up to the following Sunday.
            # NOTE(review): raises KeyError if that Sunday is missing from
            # date_dict, e.g. when the range ends on a Saturday - confirm.
            sunday = single_date + one_day
            x_list = [single_date, sunday]
            y_list = [date_dict[single_date][1], date_dict[sunday][1]]
            weekend_comments.extend(y_list)
            print(x_list)
            ax.fill_between(x_list, y_list, alpha=0.3, color='#FF8E8E')
        elif day_of_week != 6:
            # Monday to Friday; Sunday was already counted with its Saturday.
            weekday_comments.append(date_dict[single_date][1])
            print(day_of_week)

    ax.set_ylabel(u'Количество комментариев за день')
    ax.set_xlabel(u'Дата')

    red_patch = mpatches.Patch(color='#FF8E8E', alpha=0.4, label=u"Суббота и воскресенье")
    plt.legend(handles=[red_patch], fontsize=14)

    ax.format_xdata = mdates.DateFormatter('%Y-%m-%d')
    fig.autofmt_xdate()  # rotates the date labels
    ax.grid(True)
    for ext in config.get("tex_image_format"):
        plt.savefig(config.get("tex_image_path") + "comments_by_day." + ext,
                    format=ext, bbox_inches='tight', dpi=1200)
    plt.show()

    print(weekend_comments)
    print(weekday_comments)
    print("%s %s" % (sum(weekend_comments), len(weekend_comments)))
    print("%s %s" % (sum(weekday_comments), len(weekday_comments)))
示例#12
0
 def explain(self):
     """Run SentiStrength in ``explain`` mode on ``self.text``.

     The text is sent on stdin with every space replaced by ``+`` and is
     also printed in that form.  Returns the process's stdout; raises
     Exception if the process wrote anything to stderr.
     """
     command = ("java -jar {0}/Thesis/senti/SentiStrengthCom.jar stdin noDictionary illegalDoubleLettersInWordMiddle"
                " ёйухцчщьыъ illegalDoubleLettersAtWordEnd абвгджзйкоуфхцчщэ UTF8 urlencoded explain sentidata "
                "{0}/Thesis/senti/dict/{1}/").format(config.get("home_path"), self.dict_name)
     process = subprocess.Popen(shlex.split(command),
                                stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

     payload = self.text.replace(" ", "+")
     stdout_text, stderr_text = process.communicate(payload)
     print(payload)

     if stderr_text:
         raise Exception("Ошибка в модуле оценки эмоций:" + stderr_text)
     return stdout_text
示例#13
0
def perplexity():
    """Train LDA models for 5..100 topics and plot their held-out perplexity.

    The corpus is split into a random 90% training sample and the remaining
    test documents, both kept in corpus order.  For every topic count the
    per-word bound on the test documents and ``2 ** (-bound)`` are recorded.
    Two figures are saved, ``perplexity_perwordbound.<ext>`` and
    ``perplexity_exp2.<ext>``, for every configured image format.
    """
    doc_count = len(corpus)
    train_size = int(round(doc_count*0.9))

    # Random sample of indices, sorted so the original order is preserved.
    train_index = sorted(random.sample(xrange(doc_count), train_size))
    test_index = sorted(set(xrange(doc_count)).difference(train_index))
    train_corpus = [corpus[i] for i in train_index]
    test_corpus = [corpus[j] for j in test_index]

    topics_list = []
    perwordbound_list = []
    perplexity2_list = []
    for num_topics in range(5, 101, 5):
        model_name = "lda20/lda_training_{num_topics}".format(num_topics=str(num_topics))
        model = LDA(dictionary, train_corpus, num_topics, model_name, passes=20)
        perwordbound = model.log_perplexity(test_corpus)

        topics_list.append(num_topics)
        perwordbound_list.append(perwordbound)
        perplexity2_list.append(np.exp2(-perwordbound))

    def save_curve(values, y_label, file_stem):
        # Plot one metric against the topic counts, save it in every
        # configured format and close the figure.
        plt.plot(topics_list, values, 'o-')
        plt.ylabel(y_label)
        plt.xlabel(u'Количество тем')
        plt.grid(True)
        plt.xlim(0, 105)
        for ext in config.get("tex_image_format"):
            plt.savefig(config.get("tex_image_path") + file_stem + ext,
                        bbox_inches='tight', format=ext, dpi=1200)
        plt.close()

    save_curve(perwordbound_list, u'Per word bound', "perplexity_perwordbound.")
    save_curve(perplexity2_list, u'Перплексия', "perplexity_exp2.")
示例#14
0
def render(filename,**args):
    """Render the mako template ``filename`` with ``args`` plus common values.

    The following keys are always set (overriding same-named keyword
    arguments): ``sitename``, ``cdate`` (today as ``YYYY-MM-DD``),
    ``session``, ``ctx``, ``current_css`` and ``is_select``.

    If looking up or rendering the template raises, the output of mako's
    text error template is returned instead.
    """
    try:
        mytemplate = _lookup.get_template(filename)
        args["sitename"] = config.get("sitename")
        args["cdate"] = datetime.datetime.now().strftime("%Y-%m-%d")
        args['session'] = web.ctx.session
        args["ctx"] = web.ctx
        args["current_css"] = current_css
        args["is_select"] = is_select
        return mytemplate.render(**args)
    except Exception:
        # Not a bare ``except``: SystemExit and KeyboardInterrupt must
        # propagate instead of being turned into an error page.
        return exceptions.text_error_template().render()
示例#15
0
 def watch(self):
     """Poll the price source for every watched stock, forever.

     Each round fetches ``self.source % <stock>`` for every watchlist entry,
     parses the response as a float price, updates the entry's price history
     and market values, then calls ``self.present()`` and sleeps for
     ``update_frequency_sec`` (config) seconds.

     Any exception while processing an entry is printed and the process
     exits with status 1.
     """
     while True:
         for idx, c in enumerate(self.watchlist):
             try:
                 timestamp = time.time()
                 stat = urllib2.urlopen(self.source % c.get('stock')).read().strip()
                 self.watchlist[idx] = self.update_price_history(c, timestamp, float(stat))
                 self.watchlist[idx] = self.update_market_values_for_holdings(c)
             except Exception as ex:
                 # Print the exception itself: ``ex.message`` is deprecated
                 # and is empty for exceptions such as urllib2.URLError.
                 print(ex)
                 sys.exit(1)
         self.present()
         time.sleep(config.get('update_frequency_sec'))
示例#16
0
    def build_portfolio(self):
        """Build the in-memory portfolio from the portfolio JSON file.

        The file path is read from ``config['paths']['portfolio_file_path']``.
        Each stock in the file becomes a dict holding its code, a live price
        history seeded with one empty entry, its holdings mapped through
        ``self.stock_record``, and empty ``announcement_dates`` and ``news``.

        Returns the list of those dicts.
        """
        portfolio_path = config.get('paths').get('portfolio_file_path')
        # Close the file deterministically instead of leaking the handle.
        with open(portfolio_path, 'r') as portfolio_file:
            portfolio_data = json.load(portfolio_file)

        portfolio = []
        for stock in portfolio_data:
            portfolio.append({
                'stock': stock.get('code'),
                'history_live': [{'time': '', 'price': 0.00},],
                # Built eagerly as a list, which is what map() returned on Python 2.
                'holdings': [self.stock_record(holding) for holding in stock.get('holdings')],
                'announcement_dates': {},
                'news': {},
            })
        return portfolio
示例#17
0
def number_of_docs_by_day():
    """Plot the number of documents per day and highlight weekends.

    Counts come from one ``db.docs_topics`` query per date of
    ``daterange()``.  Saturday/Sunday pairs are shaded.  The figure is saved
    as ``docs_by_day.<ext>`` for every configured image format and shown.
    Finally the sum, count and their quotient are printed for the weekend
    values and for the weekday values.
    """
    date_dict = {}
    for date in daterange():
        date_dict[date] = db.docs_topics.find({"date": date}).count()
    print(date_dict)

    one_day = datetime.timedelta(1)
    weekend, weekday = [], []

    fig, ax = plt.subplots()

    for i, single_date in enumerate(daterange()):
        print(i)
        day_of_week = single_date.weekday()
        if day_of_week == 5:
            # Saturday: shade the span up to the following day.
            next_day = single_date + one_day
            x_list = [single_date, next_day]
            y_list = [date_dict[single_date], date_dict[next_day]]
            weekend.extend(y_list)
            print(x_list)
            ax.fill_between(x_list, y_list, alpha=0.3, color='#FF8E8E')
        elif day_of_week != 6:
            # Monday to Friday.
            weekday.append(date_dict[single_date])
            print(day_of_week)

    weekend_patch = mpatches.Patch(color='#FF8E8E', alpha=0.4, label=u"Суббота и воскресенье")
    plt.legend(handles=[weekend_patch], fontsize=14)

    by_date = sorted(date_dict.items(), key=lambda item: item[0])
    ax.plot(*zip(*by_date))
    ax.set_ylabel(u'Количество статей')
    ax.set_xlabel(u'Дата')
    ax.format_xdata = mdates.DateFormatter('%Y-%m-%d')
    fig.autofmt_xdate()  # rotates the date labels
    fig.set_figwidth(11)
    ax.grid(True)
    for ext in config.get("tex_image_format"):
        plt.savefig(config.get("tex_image_path") + "docs_by_day." + ext,
                    format=ext, bbox_inches='tight', dpi=1200)
    plt.show()
    print("%s %s %s" % (sum(weekend), len(weekend), sum(weekend)/len(weekend)))
    print("%s %s %s" % (sum(weekday), len(weekday), sum(weekday)/len(weekday)))
示例#18
0
def validate_access_token(access_token):
    """
    This request requires validation. To get an access token use the ``/authenticate`` endpoint.
    """
    # Check whether ``access_token`` is valid.
    #
    # The token stops validating when the user changes their password,
    # because the password stored in the token is compared with the user's
    # current one.
    #
    # parameters
    # ==========
    # * access_token - a token created with create_access_token
    #
    # returns
    # =======
    # ``{'user': <user>}`` when the token is valid, ``False`` otherwise.
    from settings import config

    # TODO: make the age configurable
    # for now the access_token is valid for 5 hours
    max_age_seconds = 60 * 60 * 5

    validated = False

    try:
        decoded = jwt.decode(access_token, config.get('pepper', ''), algorithms=['HS256'])
    except jwt.DecodeError:
        logger.debug('jwt DecodeError')
    else:
        now = datetime.utcnow()
        # Interpret ``iat`` as UTC so it is comparable with ``utcnow()``;
        # ``fromtimestamp`` would yield local time and skew the age by the
        # UTC offset.
        then = datetime.utcfromtimestamp(decoded['iat'])

        age = now - then
        user = User.get_by_id(decoded['user_id'])

        # ``total_seconds()`` includes whole days; ``.seconds`` is only the
        # sub-day remainder and let tokens older than a day pass.
        if age.total_seconds() > max_age_seconds:
            logger.debug('stale access token, timestamp expired')
        elif user and decoded['password'] == user.password:
            validated = {
                'user': user,
            }
            logger.debug('Valid access token for user:{}'.format(user.id))
        else:
            logger.debug('access token failed to validate')

    return validated
示例#19
0
 def senti_tuple(self):
     """Rate ``self.text`` with SentiStrength.

     The text is sent on stdin with every space replaced by ``+``.  Returns
     ``(positive, negative)`` as ints, the second one negated after parsing;
     raises Exception if the process wrote anything to stderr.
     """
     # Adding the ``explain`` parameter to the command makes SentiStrength
     # explain its output.
     command = ("java -jar {0}/Thesis/senti/SentiStrengthCom.jar stdin noDictionary illegalDoubleLettersInWordMiddle"
                " ёйухцчщьыъ illegalDoubleLettersAtWordEnd абвгджзйкоуфхцчщэ UTF8 urlencoded sentidata "
                "{0}/Thesis/senti/dict/{1}/").format(config.get("home_path"), self.dict_name)
     # shlex turns the command line string into the args list Popen expects.
     process = subprocess.Popen(shlex.split(command),
                                stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

     payload = self.text.replace(" ", "+")
     stdout_text, stderr_text = process.communicate(payload)

     # Remove the tab between the two ratings, e.g. "1\t-5" -> "1-5".
     ratings = stdout_text.rstrip().replace("\t","")
     if stderr_text:
         raise Exception("Ошибка в модуле оценки эмоций:" + stderr_text)

     parts = ratings.split("-")
     return (int(parts[0]), -int(parts[1]))
示例#20
0
文件: widgets.py 项目: jhcepas/alot
    def __init__(self, tag):
        """Create the widget for ``tag``.

        The displayed text is the ``tag-translate`` config entry for the tag,
        falling back to the tag itself.  Both the focused and the unfocused
        palette are set to the tag's normal attribute.
        """
        self.tag = tag

        attr_normal = config.get_tagattr(tag)
        # NOTE(review): the focus attribute is looked up but never applied;
        # both palettes below use the normal attribute - confirm intent.
        config.get_tagattr(tag, focus=True)

        self.translated = config.get('tag-translate', tag, fallback=tag)
        self.txt = urwid.Text(self.translated)
        self.focus_palette = self.unfocus_palette = attr_normal
        urwid.AttrMap.__init__(self, self.txt, self.unfocus_palette, self.focus_palette)
示例#21
0
def doc_number():
    """Draw a bar chart of the number of articles per news source.

    The counts are hard-coded below.  Each bar is annotated with its height;
    the figure is saved as ``doc_number.<ext>`` for every configured image
    format and then shown.
    """
    width = 0.8
    data = {
        "bk55": 14078,
        "gorod55": 6302,
        "ngs55": 4780,
        "omskinform": 8727
    }
    # keys() and values() of an unmodified dict iterate in matching order.
    labels = list(data.keys())
    numbers = list(data.values())
    sources_x = np.arange(len(data))

    fig, ax = plt.subplots()
    ax.set_xlabel(u'Источник')
    ax.set_ylabel(u'Количество статей')
    rects = ax.bar(sources_x, numbers, width)
    plt.xticks(sources_x + 0.4, labels)

    # Write each bar's height slightly above it.
    for rect in rects:
        height = rect.get_height()
        center_x = rect.get_x() + rect.get_width()/2
        ax.text(center_x, height + 5, height, ha='center', va='bottom')

    for ext in config.get("tex_image_format"):
        plt.savefig(config.get("tex_image_path") + "doc_number." + ext, bbox_inches='tight', format=ext, dpi=1200)
    plt.show()
示例#22
0
文件: message.py 项目: cycomanic/alot
    def get_datestring(self):
        """
        Return the reformatted date string for this message.

        Returns None when the message has no datetime.  Otherwise the
        format given by `timestamp_format` in the general section of the
        config is used when that option exists, and
        `helper.pretty_datetime` when it does not.
        """
        when = self._datetime
        if when is None:
            return None
        if not config.has_option('general', 'timestamp_format'):
            return helper.pretty_datetime(when)
        return when.strftime(config.get('general', 'timestamp_format'))
示例#23
0
文件: widgets.py 项目: lmacken/alot
    def rebuild(self):
        """Rebuild the column widgets of this thread line.

        Columns, in order: date, message count, one widget per tag, authors,
        subject (only when non-empty) and, when ``self.display_content`` is
        set, the joined text content of all messages.
        """
        cols = []

        def add_fixed(text, widget):
            # Fixed-width column exactly as wide as its text.
            cols.append(('fixed', len(text), widget))

        # Date: configured strftime format, else the pretty short form.
        formatstring = config.get('general', 'timestamp_format')
        newest = self.thread.get_newest_date()
        datestring = (newest.strftime(formatstring) if formatstring
                      else pretty_datetime(newest).rjust(10))
        self.date_w = urwid.AttrMap(urwid.Text(datestring), 'threadline_date')
        add_fixed(datestring, self.date_w)

        # Message count.
        mailcountstring = "(%d)" % self.thread.get_total_messages()
        self.mailcount_w = urwid.AttrMap(urwid.Text(mailcountstring),
                                         'threadline_mailcount')
        add_fixed(mailcountstring, self.mailcount_w)

        # Tags, sorted in place.
        tags = self.thread.get_tags()
        tags.sort()
        for tag in tags:
            tag_widget = TagWidget(tag)
            self.tag_widgets.append(tag_widget)
            cols.append(('fixed', tag_widget.width(), tag_widget))

        # Authors, shortened to the configured maximum length.
        authors = self.thread.get_authors() or '(None)'
        maxlength = config.getint('general', 'authors_maxlength')
        authorsstring = shorten(authors, maxlength).strip()
        self.authors_w = urwid.AttrMap(urwid.Text(authorsstring),
                                       'threadline_authors')
        add_fixed(authorsstring, self.authors_w)

        # Subject: the widget is always created, the column only if non-empty.
        subjectstring = self.thread.get_subject().strip()
        self.subject_w = urwid.AttrMap(urwid.Text(subjectstring, wrap='clip'),
                                       'threadline_subject')
        if subjectstring:
            add_fixed(subjectstring, self.subject_w)

        # Optional content of all messages on a single line.
        if self.display_content:
            msgs = self.thread.get_messages().keys()
            msgs.sort()
            texts = [m.get_text_content() for m in msgs]
            contentstring = ' '.join(texts).replace('\n', ' ').strip()
            self.content_w = urwid.AttrMap(urwid.Text(contentstring,
                                                      wrap='clip'),
                                           'threadline_content')
            cols.append(self.content_w)

        self.columns = urwid.Columns(cols, dividechars=1)
        self.original_widget = self.columns
示例#24
0
def single_file():
    """Export the whole collection to ``<home_path>/Thesis/omsk_media.xml``.

    The file holds one ``<corpus>`` element with a ``<document>`` child per
    database document (title, content, date, url, source and comments).
    """
    corpus = etree.Element("corpus")

    for doc in db.find():
        document = etree.SubElement(corpus, "document")
        etree.SubElement(document, "title").text = doc["title"]
        etree.SubElement(document, "content").text = doc["content"]
        etree.SubElement(document, "date").text = str(doc["date"])
        etree.SubElement(document, "url").text = doc["url"]
        etree.SubElement(document, "source").text = doc["source"]
        comments = etree.SubElement(document, "comments", amount=str(len(doc["comments"])))

        for comment in doc["comments"]:
            etree.SubElement(comments, "comment").text = comment

    xml_str = etree.tostring(corpus, pretty_print=True)

    # ``with`` closes the file even if the write fails.
    with open("{0}/Thesis/omsk_media.xml".format(config.get("home_path")), 'w') as outFile:
        outFile.write(xml_str)
示例#25
0
def most_commented_topics3():
    """
    Rank topics by comment volume relative to the overall topic distribution.

    For every document each of its (topic, probability) pairs adds
    ``probability * commentsCount`` to the topic's total; the total is then
    divided by the topic's value in ``general_topic_distribution()``.  The
    result, a list of ``(topic, value)`` pairs sorted by descending value,
    is cached with pickle in
    ``<home_path>/Thesis/code/output/topics/most_commented_topic`` and read
    from there on later calls.

    A LaTeX table row is printed per topic, the value given as a percentage
    of the top topic's value.  Returns the sorted list.

    Usage example:
    for num, i in enumerate(most_commented_topics3()):
        print "{num} & {topic} & {percent}\\% \\\\".format(num=num+1, topic=i[0], percent=round(i[1]/466794.4839104599 * 100, 1))
    """
    path = "{0}/Thesis/code/output/topics/most_commented_topic".format(config.get("home_path"))

    if os.path.isfile(path):
        # NOTE: pickle must only be used on trusted files; this cache is
        # written by this function itself.
        with open(path, 'rb') as cache_file:
            result = pickle.load(cache_file)
    else:
        topics_distr = general_topic_distribution()

        # Comment mass per topic, summed over all documents.
        struct = {}
        for doc in db.docs_topics.find():
            for topic, prob in doc["topics"]:
                struct[topic] = struct.get(topic, 0) + prob * doc["commentsCount"]

        result = {}
        for item in struct.items():
            print("{0} {1}".format(item, topics_distr[item[0]]))
            result[item[0]] = item[1] / topics_distr[item[0]]

        result = sorted(result.items(), key=lambda x: x[1], reverse=True)
        with open(path, "wb") as cache_file:
            pickle.dump(result, cache_file)

    if not result:
        # Nothing to report; avoids indexing into an empty list below.
        return result

    most_commented_value = result[0][1]

    for num, i in enumerate(result):
        print("{num} & {topic_id}. {topic_name} & {percent}\\% \\\\".format(num=num+1,
                                                                             topic_id=i[0],
                                                                             topic_name=config.topics_by_id(i[0]),
                                                                             percent=round(i[1]/most_commented_value * 100, 1)))
    return result
示例#26
0
def remove_duplicates(path):
    """Remove repeated words from an emotion lookup table, in place.

    ``path`` is a dictionary directory name under
    ``<home_path>/Thesis/senti/dict/``; the file rewritten is its
    ``EmotionLookupTable.txt``.  Every line is ``word<TAB>score``.  Only the
    first entry for a word is kept and the original order is preserved.
    """
    # TODO: a word that occurs with different scores is not handled; the
    # first score simply wins.
    path = "{0}/Thesis/senti/dict/{1}/EmotionLookupTable.txt".format(config.get("home_path"), path)
    seen = set()
    output = []

    with open(path, 'r') as dict_file:
        for line in dict_file:
            # Drop a UTF-8 byte order mark if the line starts with one.
            if line.startswith(codecs.BOM_UTF8):
                line = line[len(codecs.BOM_UTF8):]
            fields = line.strip().split('\t')
            word = fields[0].strip().decode("utf8")
            score = int(fields[1])
            # Deduplicate while reading: one pass instead of rescanning all
            # entries collected so far for every line.
            if word not in seen:
                seen.add(word)
                output.append((word, score))

    with open(path, 'w') as dict_file:
        for word, score in output:
            dict_file.write((word + "\t" + str(score) + "\n").encode('utf-8'))
示例#27
0
def create_access_token(user):
    """
    Create an access token for the user.

    The token is a JWT signed with HS256 using the ``pepper`` setting
    (default: empty string).  Its payload holds the user id as a string,
    the user's ``password`` attribute and the current UTC time as ``iat``.

    parameters
    ==========

    * user (auth.models.User) - a user object

    returns
    =======

    access_token (string)
    """
    from settings import config

    # NOTE(review): the payload of a signed JWT is readable by whoever holds
    # the token, and it includes ``user.password`` here - confirm this is
    # acceptable.
    payload = {}
    payload['user_id'] = str(user.id)
    payload['password'] = user.password
    payload['iat'] = datetime.utcnow()

    signing_key = config.get('pepper', '')
    return jwt.encode(payload, signing_key, algorithm='HS256')
示例#28
0
文件: widgets.py 项目: jhcepas/alot
    def rebuild(self):
        """Rebuild the column widgets of this thread line.

        Columns, in order: date, authors, message count, one widget per tag,
        subject (only when non-empty) and, when ``self.display_content`` is
        set, the joined text content of all messages.

        The message count is shown only for threads with at least two
        messages: as a single parenthesized-number character for 2..20
        messages and as ``(N)`` above that.  Authors and count together
        occupy ``authors_maxlength`` characters.
        """
        cols = []
        # DATE
        formatstring = config.get('general', 'timestamp_format')
        newest = self.thread.get_newest_date()
        if formatstring:
            datestring = newest.strftime(formatstring)
        else:
            datestring = pretty_datetime(newest).rjust(10)
        self.date_w = urwid.AttrMap(urwid.Text(datestring), 'threadline_date')

        # SIZE
        thread_size = self.thread.get_total_messages()
        # Show number of messages only if there are at least 2 mails
        # (save space in the line)
        if 1 < thread_size <= 20:
            # U+2474 is PARENTHESIZED DIGIT ONE and the block runs up to
            # U+2487 (twenty), so the character for N is U+2474 + (N - 1).
            mailcountstring = unichr(0x2474 + thread_size - 1)
        elif thread_size > 20:
            mailcountstring = "(%d)" % thread_size
        else:
            mailcountstring = " "

        # TAGS
        tags = self.thread.get_tags()
        tags.sort()
        tagstrings = []
        for tag in tags:
            tw = TagWidget(tag)
            self.tag_widgets.append(tw)
            tagstrings.append(('fixed', tw.width(), tw))

        # AUTHORS
        authors_string = self.thread.get_authors() or '(None)'
        maxlength = config.getint('general', 'authors_maxlength')
        # Leave room for the message count inside the authors budget.
        authorsstring = shorten_author_string(authors_string, maxlength - len(mailcountstring))
        # Right-align the count in whatever the authors did not use.
        offset = maxlength - len(authorsstring)
        mailcountstring = mailcountstring.rjust(offset)
        self.mailcount_w = urwid.AttrMap(urwid.Text(mailcountstring),
                                         'threadline_mailcount')

        self.authors_w = urwid.AttrMap(urwid.Text(authorsstring),
                                       'threadline_authors')

        # SUBJECT
        subjectstring = self.thread.get_subject().strip()
        self.subject_w = urwid.AttrMap(urwid.Text(subjectstring, wrap='clip'),
                                       'threadline_subject')

        # BODY
        if self.display_content:
            msgs = self.thread.get_messages().keys()
            msgs.sort()
            lastcontent = ' '.join([m.get_text_content() for m in msgs])
            contentstring = lastcontent.replace('\n', ' ').strip()
            self.content_w = urwid.AttrMap(urwid.Text(contentstring,
                                                      wrap='clip'),
                                           'threadline_content')

        # Set column order
        cols.append(('fixed', len(datestring), self.date_w))
        cols.append(('fixed', len(authorsstring), self.authors_w))
        cols.append(('fixed', len(mailcountstring), self.mailcount_w))

        cols.extend(tagstrings)

        if subjectstring:
            cols.append(('fixed', len(subjectstring), self.subject_w))
        if self.display_content:
            cols.append(self.content_w)

        self.columns = urwid.Columns(cols, dividechars=1)
        self.original_widget = self.columns
示例#29
0
# -*- coding: utf-8 -*-
from __future__ import division
from settings import config

from gensim import corpora, models, similarities, matutils
import numpy as np
import scipy.stats as stats
import pickle
import matplotlib.pyplot as plt

from matplotlib import rc
# Font settings for matplotlib; family and size are read from the project
# config keys "tex_font_family" and "tex_font_size".
font = {
    'family': config.get("tex_font_family"),
    'weight': 'normal',
    'size': config.get("tex_font_size")
}
# Apply the settings to matplotlib's 'font' rc group.
rc('font', **font)

import scipy.sparse
from sparsesvd import sparsesvd

from pymongo import MongoClient
# MongoClient() is called without arguments, so pymongo's default connection
# settings apply; the database used is "thesis".
db = MongoClient().thesis
# The "raw_tokens" collection of that database.
raw_tokens = db.raw_tokens

from analysis import LDA, get_corpus, get_dictionary

# Define KL function
def sym_kl(p, q):
    """Return the sum of ``stats.entropy(p, q)`` and ``stats.entropy(q, p)``."""
    forward = stats.entropy(p, q)
    backward = stats.entropy(q, p)
    return np.sum([forward, backward])
示例#30
0
文件: widgets.py 项目: 0x64746b/alot
    def rebuild(self):
        """Rebuild the column widgets of this thread line.

        Columns, in order: date, message count, one widget per tag, authors,
        subject (only when non-empty) and, when ``self.display_content`` is
        set, the joined text content of all messages.  When ``self.thread``
        is falsy, placeholders are used: a blank date, ``(?)`` as count, no
        tags, ``(None)`` as authors and an empty subject.
        """
        cols = []
        # Date column: blank padding when no thread or no date is available.
        if self.thread:
            newest = self.thread.get_newest_date()
        else:
            newest = None
        if newest == None:
            datestring = u' ' * 10
        else:
            formatstring = config.get('general', 'timestamp_format')
            if formatstring:
                datestring = newest.strftime(formatstring)
            else:
                datestring = pretty_datetime(newest).rjust(10)
        self.date_w = urwid.AttrMap(urwid.Text(datestring),
                                    'search_thread_date')
        cols.append(('fixed', len(datestring), self.date_w))

        # Message count column.
        if self.thread:
            mailcountstring = "(%d)" % self.thread.get_total_messages()
        else:
            mailcountstring = "(?)"
        self.mailcount_w = urwid.AttrMap(urwid.Text(mailcountstring),
                                   'search_thread_mailcount')
        cols.append(('fixed', len(mailcountstring), self.mailcount_w))

        # Tag columns: the widget list is rebuilt from scratch on every call.
        if self.thread:
            self.tag_widgets = [TagWidget(t) for t in self.thread.get_tags()]
        else:
            self.tag_widgets = []
        # Python 2 positional form of list.sort: (cmp, key).  Widgets are
        # compared with tag_cmp on their translated tag strings.
        self.tag_widgets.sort(tag_cmp,
                              lambda tag_widget: tag_widget.translated)
        for tag_widget in self.tag_widgets:
            cols.append(('fixed', tag_widget.width(), tag_widget))

        # Authors column, shortened to the configured maximum length.
        if self.thread:
            authors = self.thread.get_authors() or '(None)'
        else:
            authors = '(None)'
        maxlength = config.getint('general', 'authors_maxlength')
        authorsstring = shorten_author_string(authors, maxlength)
        self.authors_w = urwid.AttrMap(urwid.Text(authorsstring),
                                       'search_thread_authors')
        cols.append(('fixed', len(authorsstring), self.authors_w))

        # Subject column: the widget is always created, but the column is
        # only added for a non-empty subject.  It is a weighted column.
        if self.thread:
            subjectstring = self.thread.get_subject().strip()
        else:
            subjectstring = ''
        self.subject_w = urwid.AttrMap(urwid.Text(subjectstring, wrap='clip'),
                                 'search_thread_subject')
        if subjectstring:
            cols.append(('weight', 2, self.subject_w))

        # Optional content column: text of all messages on a single line.
        if self.display_content:
            if self.thread:
                msgs = self.thread.get_messages().keys()
            else:
                msgs = []
            msgs.sort()
            lastcontent = ' '.join([m.get_text_content() for m in msgs])
            contentstring = lastcontent.replace('\n', ' ').strip()
            self.content_w = urwid.AttrMap(urwid.Text(contentstring,
                                                      wrap='clip'),
                                           'search_thread_content')
            cols.append(self.content_w)

        self.columns = urwid.Columns(cols, dividechars=1)
        self.original_widget = self.columns