Example #1
def get_status_and_title(link, x):
    # title
    title = utils.get_title(x, selectors)
    if title.text.find("shared a memory") != -1:
        x = x.find_element_by_xpath(selectors.get("title_element"))
        title = utils.get_title(x, selectors)
    status = utils.get_status(x, selectors)
    if title.text == driver.find_element_by_id(
            selectors.get("title_text")).text:
        if status == "":
            temp = utils.get_div_links(x, "img", selectors)
            if temp == "":  # no image tag, which means it is not a life event
                link = utils.get_div_links(x, "a",
                                           selectors).get_attribute("href")
                post_type = "status update without text"
            else:
                post_type = "life event"
                link = utils.get_div_links(x, "a",
                                           selectors).get_attribute("href")
                status = utils.get_div_links(x, "a", selectors).text
        else:
            post_type = "status update"
            if utils.get_div_links(x, "a", selectors) != "":
                link = utils.get_div_links(x, "a",
                                           selectors).get_attribute("href")

    elif title.text.find(" shared ") != -1:
        x1, link = utils.get_title_links(title)
        post_type = "shared " + x1
    elif title.text.find(" at ") != -1 or title.text.find(" in ") != -1:
        if title.text.find(" at ") != -1:
            x1, link = utils.get_title_links(title)
            post_type = "check in"
        elif title.text.find(" in ") != 1:
            status = utils.get_div_links(x, "a", selectors).text
    elif title.text.find(" added ") != -1 and title.text.find("photo") != -1:
        post_type = "added photo"
        link = utils.get_div_links(x, "a", selectors).get_attribute("href")

    elif title.text.find(" added ") != -1 and title.text.find("video") != -1:
        post_type = "added video"
        link = utils.get_div_links(x, "a", selectors).get_attribute("href")

    else:
        post_type = "others"
    if not isinstance(title, str):
        title = title.text
    status = status.replace("\n", " ")
    title = title.replace("\n", " ")
    return link, status, title, post_type
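
One detail worth noting in the branching above: str.find returns -1 when the substring is absent, so every comparison is against -1. A standalone illustration (the strings are made up):

print("shared a memory".find(" at "))      # -> -1, substring absent
print("checked in at Paris".find(" at "))  # -> 10, index of the match
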
Example #2
 def post_content(self, kw, cnt, bar):
     '''
         res:{'opgroup': '0', 'pid': '125867227452', 'tid': '6147431000', 'msg': '发送成功', 'pre_msg': '经验 ', 'info': {'access_state': [], 'confilter_hitwords': [], 'need_vcode': '0', 'vcode_md5': '7555x/KllzCmyK+jbZ9frCkGvrEKm/lvsIWXiJNGWK/4Z2lzOtCPczDKRsCjCJnP', 'vcode_prev_type': '0', 'vcode_type': '0', 'pass_token': ''}, 'time': 1559196367, 'ctime': 0, 'logid': 367165643, 'error_code': '0', 'server_time': '569751'}
     '''
     contents = self.convert_to_contents(cnt, kw)
     fid = get_fid(kw)
     title = '{}{}'.format(kw, get_title())
     if check(self.bduss):
         res = client_thread_add(self.bduss, kw, fid, contents[0], title)
         print(res)
         time.sleep(30)
         if 'msg' in res and res['msg'] != "发送成功":  # "发送成功" = "sent successfully"
             print('发帖失败{}'.format(kw))  # posting failed
             return
         print('{}发帖成功'.format(kw))  # posted successfully
         tid = res['tid']
         for cont in contents[1:]:
             post = client_Post(self.bduss, kw, tid, fid, cont)
             if 'error_msg' in post:
                 print('回帖失败')  # reply failed
                 return
             print(post)
             time.sleep(15)
         print('{}回帖成功'.format(kw))  # all replies posted
         bar.hassend = True
         self.session.commit()
         time.sleep(100)
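
For reference, the success check above keys off the msg field shown in the docstring's sample response; a minimal standalone sketch with a made-up response dict:

res = {'msg': '发送成功', 'tid': '6147431000'}  # "发送成功" = "sent successfully"
if 'msg' in res and res['msg'] != '发送成功':
    print('post failed')
else:
    print('post ok, tid = ' + res['tid'])  # -> post ok, tid = 6147431000
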
Example #3
def write_query_set_folder(db, prefix) :
	'''
	Load queries from the prefix.txt, get citations for them
	and write each to a single file under folder prefix
	'''
	# Create folder if it doesn't exist
	if not os.path.exists(prefix) :
		os.mkdir(prefix)

	queries_file_path = prefix + ".txt"
	with open(queries_file_path, 'r') as file :

		for line in file :
			pub_id, year, title, _query_ = line.strip().split('\t')

			file_path = "%s/%s.txt" % (prefix, pub_id)

			citations = db.select("cited", table="graph", where="citing='%s'"%pub_id)

			# Write seed document id and then one citation per line
			with open(file_path, 'w') as citations_file :
				print >> citations_file, "%s\t%s\t%s" % (pub_id, year, title)
				for cited in citations:
					title = utils.get_title(db, cited).strip()
					print >> citations_file, "%s\t%s\t%s" % ("R1", cited, title.encode("UTF-8"))
Example #4
def run(play_path, stats):
    gender_path, output_path, output_path_base, play_name = get_paths(play_path)
    # print(play_name)
    raw_play_lines, gender = get_files(play_path, gender_path)
    stats[play_name] = {'title' : get_title(raw_play_lines)}
    play_stats = stats[play_name]
    output = process_play(raw_play_lines, gender, play_stats)
Example #5
def generate_tweet_text(hilt, blade, pommel):
    hilt_details = MANIFEST["hilt"][hilt]
    blade_details = MANIFEST["blade"][blade]
    pommel_details = MANIFEST["pommel"][pommel]

    hilt_length = hilt_details["length"]
    pommel_length = pommel_details["length"]

    total_length = hilt_length + pommel_length
    average_length = AVERAGE_HILT_LENGTH + AVERAGE_POMMEL_LENGTH
    blade_length = int(AVERAGE_BLADE_LENGTH * (total_length / average_length))

    if DOUBLE_BLADE:
        total_length = hilt_length * 2
        blade_length *= 2

    title = get_title(blade_details)

    crystal = get_crystal(blade_details)

    name = f"{title} {random.choice(NAMES)}"

    tweet = f"""Owner: {name}
Hilt Length: {total_length} cm
Blade Length: {blade_length} cm
Blade Colour: {MANIFEST['blade'][blade]['colour']}
Kyber Crystal: {crystal}

#StarWars #lightsaber #{title}
"""

    return tweet
Example #6
def generate():
    posts = utils.get_posts()
    ppp = config['posts_per_page']
    pages = int(math.ceil(float(len(posts)) / ppp))

    utils.clear_dir('site/page')
    for i in range(pages):
        page_content = render_template('frontend/index.html',
                                       config=config,
                                       frontend=True,
                                       current=i + 1,
                                       first=(i == 0),
                                       last=(i == pages - 1),
                                       posts=posts[i * ppp:(i + 1) * ppp])
        file('site/page/%s.html' % (i + 1), 'w').write(
                                    page_content.encode(config['encoding']))
        if i == 0:
            file('site/index.html', 'w').write(
                                    page_content.encode(config['encoding']))

    not_found_content = render_template('404.html',
                                        config=config,
                                        frontend=True)
    file('site/404.html', 'w').write(
                                not_found_content.encode(config['encoding']))

    utils.clear_dir('site/posts')
    infos = utils.get_post_infos()

    feed = AtomFeed(config['title'],
                    feed_url=config['url_root'] + '/posts.atom',
                    url=config['url_root'])
    for info in infos:
        with open('posts/%s' % info['filename'], 'r') as f:
            content = f.read().decode(config['encoding'])
            title = utils.get_title(content)
            content = utils.postprocess_post_content(info['slug'],
                                                            content, False)
            html_content = render_template('frontend/post.html',
                                           config=config,
                                           frontend=True,
                                           title=title,
                                           content=content)
            file('site/posts/%s.html' % info['slug'], 'w').write(
                                    html_content.encode(config['encoding']))

            feed_content = render_template('feed.html',
                                           config=config,
                                           content=content)
            feed.add(title, feed_content, content_type='html',
                     url=make_external('/posts/' + info['slug']),
                     author='Tony Wang',
                     published=utils.date_localize_from_utc(info['time'],
                                                            True),
                     updated=utils.date_localize_from_utc(info['time'], True))

    file('site/posts.atom', 'w').write(
        str(feed.get_response().iter_encoded(config['encoding']).next()))

    return 'Done!'
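
The page count and slicing logic above reduces to this standalone sketch (7 made-up posts, 3 per page):

import math

posts = list(range(7))
ppp = 3
pages = int(math.ceil(float(len(posts)) / ppp))
print(pages)  # -> 3
print([posts[i * ppp:(i + 1) * ppp] for i in range(pages)])
# -> [[0, 1, 2], [3, 4, 5], [6]]
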
Example #7
 def edit(self):
     self.show()
     print("EDIT entry (Leave fields blank for no changes)")
     self.title = utils.get_title(self.title)
     self.date = utils.get_date(self.date)
     self.time = utils.get_time(self.time)
     self.notes = utils.get_notes(self.notes)
     self.save()
Example #8
 def add_task(cls):
     """Add new entry"""
     employee, _ = models.Employee.get_or_create(name=utils.get_name())
     task = models.Task.create(employee=employee,
                               title=utils.get_title(),
                               time=utils.get_time(),
                               notes=utils.get_notes())
     task.show()
     input("The entry has been added. Press enter to return to the menu")
Example #9
 def get(self, key, default=None):
     """
     Access attributes of the item. If the attribute is not found
     the default value (None) will be returned.
     """
     if key.startswith('tmp:'):
         return self._beacon_tmpdata.get(key[4:], default)
     if key == 'parent':
         return self._beacon_parent
     if key == 'media':
         return self._beacon_media
     if key == 'read_only':
         # FIXME: this is not correct, a directory can also be
         # read only on a rw filesystem.
         return self._beacon_media.get('volume.read_only', default)
     if key in ('image', 'thumbnail'):
         image = self._beacon_data.get('image')
         if not image:
             if self._beacon_parent and self._beacon_id:
                 # This is not a good solution, maybe the parent is
                 # not up to date. Well, we have to live with that
                 # for now.  Only get image from parent if the item
                 # is scanned because it is a very bad idea that
                 # unscanned images (we do not know that they are
                 # images yet) inherit the image from a directory.
                 image = self._beacon_parent.get('image')
             if not image:
                 return default
         if image.startswith('http://'):
             fname = self._beacon_controller._db.md5url(image, 'images')
             if key == 'image':
                 if not os.path.isfile(fname):
                     # FIXME: We need to fetch the image. Right now this will not happen
                     # until beacon restarts or a thumbnail is requested
                     return default
                 return fname
             if key == 'thumbnail':
                 # the thumbnail code will take care of downloading
                 return Thumbnail(image, self._beacon_media)
         if key == 'image':
             return image
         if key == 'thumbnail':
             return Thumbnail(image, self._beacon_media)
     if key == 'title':
         t = self._beacon_data.get('title')
         if t:
             return t
         # generate a title and save it locally for future use
         t = kaa.str_to_unicode(
             get_title(self._beacon_data['name'], self.isfile))
         self._beacon_data['title'] = t
         return t
     result = self._beacon_data.get(key, default)
     if result is None:
         return default
     return result
Example #10
def index():
    infos = utils.get_post_infos()

    for info in infos:
        with open('posts/%s' % info['filename'], 'r') as f:
            content = f.read().decode(config['encoding'])
            title = utils.get_title(content)
            info['title'] = title
            info['date'] = utils.date_localize_from_utc(info['time'])
    return render_template('admin/index.html', config=config, infos=infos)
Example #11
 def get(self, key, default=None):
     """
     Access attributes of the item. If the attribute is not found
     the default value (None) will be returned.
     """
     if key.startswith('tmp:'):
         return self._beacon_tmpdata.get(key[4:], default)
     if key == 'parent':
         return self._beacon_parent
     if key == 'media':
         return self._beacon_media
     if key == 'read_only':
         # FIXME: this is not correct, a directory can also be
         # read only on a rw filesystem.
         return self._beacon_media.get('volume.read_only', default)
     if key in ('image', 'thumbnail'):
         image = self._beacon_data.get('image')
         if not image:
             if self._beacon_parent and self._beacon_id:
                 # This is not a good solution, maybe the parent is
                 # not up to date. Well, we have to live with that
                 # for now.  Only get image from parent if the item
                 # is scanned because it is a very bad idea that
                 # unscanned images (we do not know that they are
                 # images yet) inherit the image from a directory.
                 image = self._beacon_parent.get('image')
             if not image:
                 return default
         if image.startswith('http://'):
             fname = self._beacon_controller._db.md5url(image, 'images')
             if key == 'image':
                 if not os.path.isfile(fname):
                     # FIXME: We need to fetch the image. Right now this will not happen
                     # until beacon restarts or a thumbnail is requested
                     return default
                 return fname
             if key == 'thumbnail':
                 # the thumbnail code will take care of downloading
                 return Thumbnail(image, self._beacon_media)
         if key == 'image':
             return image
         if key == 'thumbnail':
             return Thumbnail(image, self._beacon_media)
     if key == 'title':
         t = self._beacon_data.get('title')
         if t:
             return t
         # generate a title and save it locally for future use
         t = kaa.str_to_unicode(get_title(self._beacon_data['name'], self.isfile))
         self._beacon_data['title'] = t
         return t
     result = self._beacon_data.get(key, default)
     if result is None:
         return default
     return result
Example #12
 def edit(self):
     """
     Let the user edit a task by prompting for each of its
     attributes. Any field left blank is not changed.
     """
     self.show()
     print("EDIT entry (Leave fields blank for no changes)")
     self.title = utils.get_title(self.title)
     self.date = utils.get_date(self.date)
     self.time = utils.get_time(self.time)
     self.notes = utils.get_notes(self.notes)
Example #13
def gen_filename(record):
    """
    Guess the expected filename from the record.

    Args:
        record (dict): a record of the bibtex entry.

    Returns:
        A string which corresponds to guessed filename (expected to be a pdf).
    """
    record_copy = record.copy()
    record_copy = bibtexparser.customization.author(record_copy)

    # Retrieve a stripped down last name of the first authors
    last_names = []
    for author in record_copy['author']:
        stripped = utils.strip_accents(codecs.decode(author, "ulatex"))
        name = re.sub('([\\{\\}])', '', stripped.split(',')[0])
        name = re.sub('~', ' ', name)
        name = re.sub("\\\\'ı", "i", name)
        name = re.sub("\\\\`ı", "i", name)
        name = re.sub("ı", "i", name)
        name = re.sub('\xf8', 'o', name)
        name = re.sub('\\\\textquotesingle ', "'", name)
        name = name.replace('ł', 'l')
        last_names.append(name)

    # If there are more than 4 authors, use the 'et al.' form
    if len(last_names) > 4:
        prefix = '(' + last_names[0] + ' et al.) '
    else:
        prefix = '(' + ', '.join(last_names) + ') '

    title = utils.get_title(record_copy)
    title = title.replace('$\\Lambda_{\\infty}$ ', 'λ∞')
    title = re.sub('\\\\textendash  ', '- ', title)
    title = utils.strip_accents(codecs.decode(title, "ulatex"))
    title = re.sub('([\\{\\}])', '', title)
    title = re.sub(' *: ', ' - ', title)
    title = re.sub(' *— *', ' - ', title)
    title = re.sub('–', '-', title)
    title = re.sub('/', '-', title)
    # title = re.sub('\\$\\mathplus \\$', '+', title)
    title = re.sub('\\\\textquotesingle ', "'", title)
    title = to_titlecase(title)
    title = re.sub('"', '', title)
    title = re.sub('’', "'", title)
    title = re.sub('\u2010', '-', title)
    title = re.sub('\u2122', '', title)
    title = title.replace('$\\texttt FreeFem++$', 'FreeFem++')
    title = title.replace('$\\lambda _\\Infty $ ', 'λ∞')

    return prefix + title + '.pdf'
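
The author-prefix rule above ('et al.' past four authors) in isolation, with made-up names:

last_names = ['Knuth', 'Lamport', 'Ritchie', 'Thompson', 'Kernighan']
if len(last_names) > 4:
    prefix = '(' + last_names[0] + ' et al.) '
else:
    prefix = '(' + ', '.join(last_names) + ') '
print(prefix)  # -> (Knuth et al.)
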
Example #14
def gen_filename(record):
    """
    Guess the expected filename from the record.

    Args:
        record (dict): a record of the bibtex entry.

    Returns:
        A string which corresponds to guessed filename (expected to be a pdf).
    """
    record_copy = record.copy()
    record_copy = bibtexparser.customization.author(record_copy)

    # Retrieve a stripped down last name of the first authors
    last_names = []
    for author in record_copy['author']:
        stripped = utils.strip_accents(codecs.decode(author, "ulatex"))
        name = re.sub('([\\{\\}])', '', stripped.split(',')[0])
        name = re.sub('~', ' ', name)
        name = re.sub("\\\\'ı", "i", name)
        name = re.sub("\\\\`ı", "i", name)
        name = re.sub("ı", "i", name)
        name = re.sub('\xf8', 'o', name)
        name = re.sub('\\\\textquotesingle ', "'", name)
        name = name.replace('ł', 'l')
        last_names.append(name)

    # If there are more than 4 authors, use the 'et al.' form
    if len(last_names) > 4:
        prefix = '(' + last_names[0] + ' et al.) '
    else:
        prefix = '(' + ', '.join(last_names) + ') '

    title = utils.get_title(record_copy)
    title = title.replace('$\\Lambda_{\\infty}$ ', 'λ∞')
    title = re.sub('\\\\textendash  ', '- ', title)
    title = utils.strip_accents(codecs.decode(title, "ulatex"))
    title = re.sub('([\\{\\}])', '', title)
    title = re.sub(' *: ', ' - ', title)
    title = re.sub(' *— *', ' - ', title)
    title = re.sub('–', '-', title)
    title = re.sub('/', '-', title)
    # title = re.sub('\\$\\mathplus \\$', '+', title)
    title = re.sub('\\\\textquotesingle ', "'", title)
    title = to_titlecase(title)
    title = re.sub('"', '', title)
    title = re.sub('’', "'", title)
    title = re.sub('\u2010', '-', title)
    title = re.sub('\u2122', '', title)
    title = title.replace('$\\texttt FreeFem++$', 'FreeFem++')
    title = title.replace('$\\lambda _\\Infty $ ', 'λ∞')

    return prefix + title + '.pdf'
Example #15
 def __init__(self, **kwargs):
     """Initialize an instance of Task with needed attributes"""
     if kwargs:
         self.title = kwargs.get('Title')
         self.date = datetime.datetime.strptime(kwargs.get('Date'),
                                                '%d/%m/%Y').date()
         self.time = kwargs.get('Time')
         self.notes = kwargs.get('Notes')
     else:
         self.title = utils.get_title()
         self.date = utils.get_date()
         self.time = utils.get_time()
         self.notes = utils.get_notes()
Example #16
def gen_bibkey(record, all_keys):
    """
    Generate a unique bibtex key for the given record.

    Args:
        record (dict): a record of the bibtex entry.
        all_keys (set): a set of existing bibtex keys in the current context.

    Returns:
        A string which corresponds to the newly generated unique bibtex key.
        The argument 'all_keys' is also appended with the new key.
    """
    for field in ['year', 'title', 'author']:
        if field not in record:
            record_str = json.dumps(record,
                                    sort_keys=True,
                                    indent=4,
                                    separators=(',', ': '))
            raise ValueError(
                "Missing field '{0}' in bibtex entry:\n{1}".format(
                    field, record_str))

    record_copy = record.copy()
    record_copy = bibtexparser.customization.author(record_copy)

    # Retrieve a stripped down last name of the first author
    first_author = record_copy['author'][0]
    stripped = utils.strip_accents(codecs.decode(first_author, "ulatex"))
    last_name = stripped.split(',')[0]
    last_name = last_name.replace('ø', 'o')
    last_name = last_name.replace('ł', 'l')
    last_name = re.sub('([^a-zA-Z])', '', last_name)

    # Then get the first 3 initials of the article title
    curated_title = re.sub('([^a-zA-Z])', ' ', utils.get_title(record_copy))
    short_title = ''.join(s[0] for s in curated_title.split())
    short_title += curated_title.split()[-1][1:]
    short_title = short_title[:3].upper()

    # Key is Author:Year:Initials
    basekey = last_name + ":" + record_copy['year'] + ":" + short_title
    bibkey = basekey

    # Assign a unique key
    tail = 'a'
    while bibkey in all_keys:
        bibkey = basekey + tail
        tail = chr((ord(tail) + 1))

    all_keys.add(bibkey)
    return bibkey
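
A standalone sketch of the collision handling above: when the Author:Year:Initials base key is taken, lowercase letters are appended until the key is unique (the keys are made up):

all_keys = {'Knuth:1968:ART', 'Knuth:1968:ARTa'}
basekey = 'Knuth:1968:ART'
bibkey, tail = basekey, 'a'
while bibkey in all_keys:
    bibkey = basekey + tail
    tail = chr(ord(tail) + 1)
print(bibkey)  # -> Knuth:1968:ARTb
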
Example #17
def browse_papers(path_, csv_file, fout):
    fo = open(fout, 'w')
    print("Processing citations ...")
    dict_1, dict_2 = parse_csv_file(csv_file)

    print("Processing files ...")

    tmp_list = []
    for root, dirs, files in os.walk(path_):
        for name in files:
            if name.endswith((".json")):
                jfile = root + "/" + name
                data = json.load(open(jfile))

                year, month, day = get_date_jsonfile(jfile, data)
                journal = get_journal_short_json(jfile, data)
                issue, volume = get_issue_volume(jfile, data)
                doi = get_doi(jfile, data)
                num_pages = get_number_of_pages(jfile, data)
                coauthors = get_coauthors_jsonfile(jfile, data)
                affiliations = get_all_affiliations(jfile, data)
                countries = get_all_countries(jfile, data)
                title = get_title(jfile, data)

                str_out = ""
                str_out += str(year) + " "
                str_out += str(month) + " "
                str_out += str(day) + " "
                str_out += str(journal) + " "
                str_out += str(issue) + " "
                str_out += str(volume) + " "
                str_out += str(doi) + " "
                str_out += str(len(coauthors)) + " "
                str_out += str(len(affiliations)) + " "
                str_out += str(len(countries)) + " "
                str_out += str(len(title)) + " "
                str_out += str(num_pages) + " "

                if doi in dict_1.keys():
                    str_out += str(len(dict_1[doi])) + " "
                else:
                    str_out += str(0) + " "

                if doi in dict_2.keys():
                    str_out += str(len(dict_2[doi])) + " "
                else:
                    str_out += str(0) + " "

                fo.write(str_out + "\n")

    fo.close()
Example #18
def main():
    #TODO: add support for mysql
    for subgroup in d.available_subgroups:
        try:
            cursor.execute("SELECT * FROM %s;" %(subgroup['db_name']))
            anime_list = cursor.fetchall()
        except Exception as e:
            logging.error(e)
        
        for anime in anime_list:
            #Gets rss feed with params
            search_param = ' '.join(anime)

            i = 1
            while True:
                feed_url = ('%s?page=rss&term=%s&user=%s&offset=%d' %(nyaa_url, search_param, subgroup['nyaa_id'], i)).replace(' ', '+')
                i = i + 1
            
                feed = feedparser.parse(feed_url)
                if not feed['entries']:
                    break
                #pprint(feed)
                
                for feed_entry in feed['entries']:
                    feed_title = feed_entry['title']
                    parsed_title = utils.get_title(feed_title, subgroup['regex']['title'])
                    if parsed_title == anime[0]:
                        parsed_episode = utils.get_episode(feed_title, subgroup['regex']['episode'])
                        if parsed_episode:
                            cursor.execute("SELECT 1 FROM downloaded WHERE title='%s' AND episode='%s' AND subgroup='%s';" %(anime[0].replace("'", "''"), parsed_episode, subgroup['subgroup']))
                            if not cursor.fetchone():
                                dl_location = cfg.dl_location + anime[0]
                                if not os.path.exists(dl_location):
                                    os.mkdir(dl_location)

                                download_queue.append({'torrent': download_torrent(anime[0], feed_entry['link']), 'info': {'title': anime[0], 'episode': parsed_episode, 'quality': anime[1], 'subgroup': subgroup['subgroup']}})
                                cursor.execute("INSERT INTO downloaded VALUES('%s', '%s', '%s', '%s', '%s');" %(anime[0].replace("'", "''"), parsed_episode, datetime.datetime.now().isoformat(), subgroup['subgroup'], 'Downloading'))
                                connection.commit()

    # spawn child thread here to monitor downloads.
    while len(download_queue):
        for torrent in download_queue:
            torrent_obj = tc.get_torrent(torrent['torrent'])
            if torrent_obj.status == 'seeding':
                logging.info('%s completed.' %(torrent_obj.name))
                cursor.execute("UPDATE downloaded SET status='Completed' WHERE title='%s' AND episode='%s' AND subgroup='%s';" %(torrent['info']['title'].replace("'", "''"), torrent['info']['episode'], torrent['info']['subgroup']))
                connection.commit()
                download_queue.remove(torrent)
            sleep(1)
    connection.commit()
    connection.close()
Example #19
    def __init__(self, link, user, title=None, points=0, domain="", submitted=None):
        self.link = link
        self.user = user
        self.points = points
        # default computed per call; a datetime.now() default argument would be
        # evaluated only once, at function definition time
        self.submitted = submitted if submitted is not None else datetime.now()

        if title:
            self.title = title
        else:
            self.title = get_title(link)

        parsed_uri = urlparse(link)
        self.domain = '{uri.netloc}'.format(uri=parsed_uri)
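
The domain field is just the network location of the parsed link; a minimal standalone version (Python 3 import path, made-up URL):

from urllib.parse import urlparse

parsed_uri = urlparse('https://news.example.com/story/42')
print('{uri.netloc}'.format(uri=parsed_uri))  # -> news.example.com
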
Example #20
def summarize(page_id, results_str, features):
    print 'results for %s (id=%d)' % (utils.get_title(page_id).encode(
        'ascii', 'ignore'), page_id)
    if len(results_str) < 2:
        print '\tno results'
        return
    i = 1
    results = []
    ranks_by_score = collections.defaultdict(list)
    for pair in results_str[1:-1].split('|'):
        tokens = pair.split(',')
        page_id2 = int(tokens[0])
        score = float(tokens[1])
        used_features = tokens[2] if len(tokens) == 3 else ''
        results.append([page_id2, score, used_features])
        ranks_by_score[score].append(i)
        i += 1

    ranks_to_show = set()
    r = 1
    while r <= len(results):
        for i in range(r, r + 3):
            ranks_to_show.add(i)
        r *= 2

    ranks_to_show = [r for r in sorted(ranks_to_show) if r <= len(results)]
    for rank in ranks_to_show:
        (page_id2, score, used_features) = results[rank - 1]
        tie = ''
        if len(ranks_by_score[score]) > 1:
            tie = ', %d-way tie' % len(ranks_by_score[score])
        feature_info = ''
        if features and used_features:
            feature_info = ', %s:%s' % (used_features, features)
        print(u'\t%.5d: %s (id=%d, score=%.3f%s%s)' %
              (rank, utils.get_title(page_id2), page_id2, score, tie,
               feature_info)).encode('utf-8')
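
The rank-window logic above shows three ranks starting at each power of two; reduced to a standalone sketch for 20 results:

ranks_to_show = set()
r = 1
while r <= 20:
    for i in range(r, r + 3):
        ranks_to_show.add(i)
    r *= 2
print(sorted(x for x in ranks_to_show if x <= 20))
# -> [1, 2, 3, 4, 5, 6, 8, 9, 10, 16, 17, 18]
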
Example #21
def get_all_websphere(page=None):
    """
    Fetch the list of WebSphere records, paginated.
    :param page: current page number
    :return: the paginated result list plus pagination info
    """
    app.logger.debug("run into get_all_websphere function")
    page = request.args.get('page', 1, type=int)
    paginate = db.session.query(WebSphere, System).join(System).order_by(
        System.inventory).paginate(page, NUM_PER_PAGE)
    was_list_in = paginate.items
    return render_template("all_websphere.html",
                           title=get_title("WebSphere信息列表"),
                           pagination=paginate,
                           was_list=was_list_in)
Example #22
def get_all_db2(page=None):
    """
    Fetch the list of DB2 records, paginated.
    :param page: current page number
    :return: the paginated result list plus pagination info
    """
    app.logger.debug("run into get_all_db2 function")
    page = request.args.get('page', 1, type=int)
    paginate = db.session.query(DB2, System).join(System).paginate(
        page, NUM_PER_PAGE)
    db2_list_in = paginate.items
    return render_template("all_db2.html",
                           title=get_title("DB2信息列表"),
                           pagination=paginate,
                           db2_list=db2_list_in)
Example #23
def summarize(page_id, results_str, features):
    print 'results for %s (id=%d)' % (utils.get_title(page_id).encode('ascii', 'ignore'), page_id)
    if len(results_str) < 2:
        print '\tno results'
        return
    i = 1
    results = []
    ranks_by_score = collections.defaultdict(list)
    for pair in results_str[1:-1].split('|'):
        tokens = pair.split(',')
        page_id2 = int(tokens[0])
        score = float(tokens[1])
        used_features = tokens[2] if len(tokens) == 3 else ''
        results.append([page_id2, score, used_features])
        ranks_by_score[score].append(i)
        i += 1

    ranks_to_show = set()
    r = 1
    while r <= len(results):
        for i in range(r, r + 3):
            ranks_to_show.add(i)
        r *= 2

    ranks_to_show = [r for r in sorted(ranks_to_show) if r <= len(results)]
    for rank in ranks_to_show:
        (page_id2, score, used_features) = results[rank-1]
        tie = ''
        if len(ranks_by_score[score]) > 1:
            tie = ', %d-way tie' % len(ranks_by_score[score])
        feature_info = ''
        if features and used_features:
            feature_info = ', %s:%s' % (used_features, features)
        print (u'\t%.5d: %s (id=%d, score=%.3f%s%s)'
                % (rank, utils.get_title(page_id2), page_id2, score, tie, feature_info)
            ).encode('utf-8')
Example #24
def lambda_handler(event, context):
    options = FirefoxOptions()
    options.headless = True
    options.binary_location = '/usr/local/bin/firefox'

    driver = webdriver.Firefox(executable_path='/usr/local/bin/geckodriver',
                               log_path='/tmp/geckodriver.log',
                               firefox_options=options)

    title = get_title(driver)
    print(title)

    driver.quit()

    return {'statusCode': 200, 'body': json.dumps("LGTM")}
Example #25
def edit(time_slug):
    filename = '%s.md' % time_slug
    file_path = 'posts/%s' % filename
    if not os.path.exists(file_path):
        # TODO: 404 page
        return '', 404
    if request.method == 'GET':
        info = utils.parse_filename(filename)
        with open(file_path) as f:
            content = f.read().decode(config['encoding'])
        info['title'] = utils.get_title(content)
        info['content'] = '\n'.join(content.splitlines()[4:])
        info['date'] = utils.date_localize_from_utc(info['time'])

        return render_template('admin/edit.html',
                               config=config,
                               info=info)
    elif request.method == 'POST':
        result = utils.parse_filename(filename)
        if not result:
            return '', 404
        title = request.form['title'].strip()
        date = request.form['date'].strip()
        content = request.form['content'].strip()
        slug = request.form['slug'].strip()

        try:
            post_time = utils.datetime2epoch(date)
        except ValueError:
            # TODO: flash message
            return '', 404
        time_str = utils.date_localize_from_utc(post_time)

        file_to_remove = None
        if post_time != result['time'] or slug != result['slug']:
            file_to_remove = file_path
            file_path = 'posts/%s-%s.md' % (post_time, slug)

        file_content = content_template % {'title': title,
                                           'time': time_str,
                                           'content': content}
        with open(file_path, 'w') as f:
            f.write(file_content.encode(config['encoding']))

        if file_to_remove:
            os.remove(file_to_remove)

        return redirect('/')
Example #26
def get_filter_system(inventory_filter=None, os_filter=None):
    """
    Fetch the filtered system list; results can be filtered by inventory/os.
    :param inventory_filter: IP filter
    :param os_filter: OS type filter
    :return: details.html
    """
    app.logger.debug("filter")
    sys_was_count_list = []
    sys_db2_count_list = []
    if request.method == 'POST':
        inventory_filter = request.form['inventory_filter']
        os_filter = request.form['os_filter']
        app.logger.debug("POST")
    elif request.method == 'GET':
        inventory_filter = request.args.get('inventory_filter')
        os_filter = request.args.get('os_filter')
        app.logger.debug("GET")
    app.logger.debug("inventory_filter: {0}".format(inventory_filter))
    app.logger.debug("os_filter: {0}".format(os_filter))
    page = request.args.get('page', 1, type=int)
    # paginate the results
    if os_filter == "all":
        paginate = System.query.filter(
            System.inventory.like("%{0}%".format(inventory_filter))).paginate(
                page, NUM_PER_PAGE)
    else:
        paginate = System.query.filter(System.inventory.like("%{0}%".format(inventory_filter))). \
            filter(System.os_info == str(os_filter)).paginate(page, NUM_PER_PAGE)
    systems = paginate.items
    for one_system in systems:
        sys_was_count = WebSphere.query.filter_by(
            sys_inventory=one_system.inventory).count()
        sys_db2_count = DB2.query.filter_by(
            sys_inventory=one_system.inventory).count()
        sys_was_count_list.append(sys_was_count)
        sys_db2_count_list.append(sys_db2_count)
    db.session.close()
    app.logger.debug(systems)
    return render_template("all_system.html",
                           inventory_filter_val=inventory_filter,
                           title=get_title("主机信息列表"),
                           system_list=systems,
                           pagination=paginate,
                           os_filter_val=os_filter,
                           os_list_val=get_os_list(),
                           sys_was_count_list=sys_was_count_list,
                           sys_db2_count_list=sys_db2_count_list)
Example #27
def gen_bibkey(record, all_keys):
    """
    Generate a unique bibtex key for the given record.

    Args:
        record (dict): a record of the bibtex entry.
        all_keys (set): a set of existing bibtex keys in the current context.

    Returns:
        A string which corresponds to the newly generated unique bibtex key.
        The argument 'all_keys' is also appended with the new key.
    """
    for field in ['year', 'title', 'author']:
        if field not in record:
            record_str = json.dumps(record, sort_keys=True, indent=4, separators=(',', ': '))
            raise ValueError("Missing field '{0}' in bibtex entry:\n{1}".format(field, record_str))

    record_copy = record.copy()
    record_copy = bibtexparser.customization.author(record_copy)

    # Retrieve a stripped down last name of the first author
    first_author = record_copy['author'][0]
    stripped = utils.strip_accents(codecs.decode(first_author, "ulatex"))
    last_name = stripped.split(',')[0]
    last_name = last_name.replace('ø', 'o')
    last_name = last_name.replace('ł', 'l')
    last_name = re.sub('([^a-zA-Z])', '', last_name)

    # Then get the first 3 initials of the article title
    curated_title = re.sub('([^a-zA-Z])', ' ', utils.get_title(record_copy))
    short_title = ''.join(s[0] for s in curated_title.split())
    short_title += curated_title.split()[-1][1:]
    short_title = short_title[:3].upper()

    # Key is Author:Year:Initials
    basekey = last_name + ":" + record_copy['year'] + ":" + short_title
    bibkey = basekey

    # Assign a unique key
    tail = 'a'
    while bibkey in all_keys:
        bibkey = basekey + tail
        tail = chr((ord(tail) + 1))

    all_keys.add(bibkey)
    return bibkey
Example #28
	def __init__(self,link):
		self.link = link
		self.title = ""
		self.time = 0
		self.content = ""
		self.keywords = ""
		self.refer = []
		self.status = False # whether parsing succeeded

		# check whether the link has already been parsed

		# check whether it is on the unparseable list
		if link.find('http://') == -1:
			return # invalid link
		base_url = 'http://' + link.split('/')[2]
		# unparse_check = store.find(UnparsePage_m, UnparsePage_m.url == base_url.decode('utf-8'))
		# if unparse_check.count() != 0:
			# print "can not parse this link"
			# return
		self.pq = ""
		try:
			self.pq = pq(url=link).make_links_absolute() # parsing may fail
		except Exception as err: 
			print "failed to open this link " + link
		if self.pq == "":
			return
		# get title
		self.title = get_title(self.pq)
		self.time = time.time()
		self.content = get_content(self.pq)
		self.refer = get_refer(self.pq)
		if len(self.title) == 0 or \
		len(self.content) == 0 or len(self.refer) == 0:
			# failed to parse
			print "can not parse " + link
			# add the site to the unparseable-sites database
			mpage = UnparsePage_m()
			mpage.url = base_url.decode('utf-8')
			mpage.save()
			self.keywords = ''
			return
		else:
			# get keywords
			self.keywords = jieba.cut_for_search(self.title)
		self.status = True
Example #29
    def _poll(self):
        url = self.preferences.jenkinsURL
        client = self.preferences.newJenkinsClient()
        # Jobs:
        _jobs = client.get_jobs()
        if self.preferences.extendedInfo:
            for jobDict in _jobs:
                extendedInfo = client.get_job_info(jobDict["name"])
                jobDict.update(extendedInfo)
        # favicon:
        _tempFileName = self._getFavIcon(url)  

        # title:
        try:
            title = get_title(url)
        except Exception, e:
            title = self.UNABLE_TO_CONNECT_TITLE
            print e
Example #30
 def _beacon_update(self, prop):
     """
     Update media properties.
     """
     self.prop = prop
     self.device = str(prop.get('block.device',''))
     self.mountpoint = str(prop.get('volume.mount_point',''))
     log.info('new media %s (%s) at %s', self.id, self.device, self.mountpoint)
     if not self.mountpoint:
         self.mountpoint = self.device
     if not self.mountpoint.endswith('/'):
         self.mountpoint += '/'
     # get basic information from database
     media = self._beacon_controller._beacon_media_information(self)
     if isinstance(media, kaa.InProgress):
         # This will happen for the client because in the client
         # _beacon_media_information needs to lock the db.
         media = yield media
     self.beaconid = media['id']
     prop['beacon.content'] = media['content']
     self._beacon_isdir = False
     if media['content'] == 'file':
         self._beacon_isdir = True
     self.thumbnails = os.path.join(self.overlay, '.thumbnails')
     if self.mountpoint == '/':
         self.thumbnails = os.path.join(os.environ['HOME'], '.thumbnails')
     if self.root.get('title'):
         self.label = self.root.get('title')
     elif prop.get('volume.label'):
         self.label = utils.get_title(prop.get('volume.label'))
     elif prop.get('info.parent'):
         self.label = u''
         parent = prop.get('info.parent')
         if parent.get('storage.vendor'):
             self.label += parent.get('storage.vendor') + u' '
         if parent.get('info.product'):
             self.label += parent.get('info.product')
         self.label = self.label.strip()
         if self.device:
             self.label += ' (%s)' % self.device
         if not self.label:
             self.label = self.id
     else:
         self.label = self.id
Example #31
 def _beacon_update(self, prop):
     """
     Update media properties.
     """
     self.prop = prop
     self.device = str(prop.get('block.device', ''))
     self.mountpoint = str(prop.get('volume.mount_point', ''))
     log.info('new media %s (%s) at %s', self.id, self.device,
              self.mountpoint)
     if not self.mountpoint:
         self.mountpoint = self.device
     if not self.mountpoint.endswith('/'):
         self.mountpoint += '/'
     # get basic information from database
     media = self._beacon_controller._beacon_media_information(self)
     if isinstance(media, kaa.InProgress):
         # This will happen for the client because in the client
         # _beacon_media_information needs to lock the db.
         media = yield media
     self.beaconid = media['id']
     prop['beacon.content'] = media['content']
     self._beacon_isdir = False
     if media['content'] == 'file':
         self._beacon_isdir = True
     # TODO: choose self.thumbnails for media not /
     self.thumbnails = os.path.join(os.environ['HOME'], '.thumbnails')
     if self.root.get('title'):
         self.label = self.root.get('title')
     elif prop.get('volume.label'):
         self.label = utils.get_title(prop.get('volume.label'))
     elif prop.get('info.parent'):
         self.label = u''
         parent = prop.get('info.parent')
         if parent.get('storage.vendor'):
             self.label += parent.get('storage.vendor') + u' '
         if parent.get('info.product'):
             self.label += parent.get('info.product')
         self.label = self.label.strip()
         if self.device:
             self.label += ' (%s)' % self.device
         if not self.label:
             self.label = self.id
     else:
         self.label = self.id
Example #32
def get_group_post_as_line(post_id, photos_dir):
    try:
        data = driver.find_element_by_xpath(selectors.get("single_post"))
        time = utils.get_time(data)
        title = utils.get_title(data, selectors).text
        # link, status, title, type = get_status_and_title(title,data)
        link = utils.get_div_links(data, "a", selectors)
        if link != "":
            link = link.get_attribute("href")
        post_type = ""
        status = '"' + utils.get_status(data, selectors).replace("\r\n",
                                                                 " ") + '"'
        photos = utils.get_post_photos_links(data, selectors,
                                             photos_small_size)
        comments = get_comments()
        photos = image_downloader(photos, photos_dir)
        line = (str(time) + "||" + str(post_type) + "||" + str(title) + "||" +
                str(status) + "||" + str(link) + "||" + str(post_id) + "||" +
                str(photos) + "||" + str(comments) + "\n")
        return line
    except Exception:
        return ""
Example #33
def detail(inventory=None):
    """
    The traditional way of fetching system information.
    Fetch the system, WAS, and DB2 info for the given inventory and render details.html.
    :param inventory: system IP
    :return: details.html: the system detail page, covering system/WAS/DB2 info
    """
    try:
        system_detail = System.query.filter_by(
            inventory=inventory).first_or_404()
        if PRODUCT:
            # remove the existing WAS/DB2 info from the database
            # app.logger.debug("remove current WebSphere/DB2 info")
            # for one_was in was_detail:
            #     db.session.delete(one_was)
            # for one_db2 in db2_detail:
            #     db.session.delete(one_db2)
            # call ansible function to retrieve websphere,db2,system info for target inventory
            # currently only WebSphere info is retrieved
            details_host_ok = details_ansible_run(inventory_in=inventory)
            app.logger.debug(system_detail)
            detail_update(system_detail, details_host_ok)
        db.session.commit()
        # app.logger.debug(details_host_ok)
        new_was_detail = WebSphere.query.filter_by(
            sys_inventory=inventory).all()
        new_db2_detail = DB2.query.filter_by(sys_inventory=inventory).all()
        return render_template("details.html",
                               title=get_title("具体信息"),
                               system_detail_in=system_detail,
                               was_detail_in=new_was_detail,
                               db2_detail_in=new_db2_detail)
    except Exception as e:
        app.logger.debug(e)
        # update failed; roll back immediately
        db.session.rollback()
        return render_template("500.html")
Example #34
def browse_papers(path_, csv_file, xmin=60):
    print("Processing citations ...")
    dict_1, dict_2 = parse_csv_file(csv_file)

    print("Processing files ...")

    papers_list = {}
    for root, dirs, files in os.walk(path_):
        for name in files:
            if name.endswith(".json"):
                jfile = root + "/" + name
                data = json.load(open(jfile))

                year, month, day = get_date_jsonfile(jfile, data)
                journal = get_journal_short_json(jfile, data)
                issue, volume = get_issue_volume(jfile, data)
                coauthors = get_coauthors_jsonfile(jfile, data)
                title = get_title(jfile, data)
                doi_ = get_doi(jfile, data)

                if doi_ in dict_1.keys():
                    cits_ = len(dict_1[doi_])
                else:
                    cits_ = 0

                if doi_ in dict_2.keys():
                    refs_ = len(dict_2[doi_])
                else:
                    refs_ = 0

                if cits_ >= xmin:
                    papers_list[doi_] = [title.encode('utf-8'), str(journal),
                                         str(year), str(volume), str(issue),
                                         str(cits_), str(refs_)]

    print("Database processed ...")
    return papers_list
Example #35
def get_all_system():
    sys_was_count_list = []
    sys_db2_count_list = []
    page = request.args.get('page', 1, type=int)
    # paginate the results
    paginate = System.query.paginate(page, NUM_PER_PAGE)
    systems = paginate.items
    for one_system in systems:
        sys_was_count = WebSphere.query.filter_by(
            sys_inventory=one_system.inventory).count()
        sys_db2_count = DB2.query.filter_by(
            sys_inventory=one_system.inventory).count()
        sys_was_count_list.append(sys_was_count)
        sys_db2_count_list.append(sys_db2_count)
    db.session.close()
    return render_template("all_system.html",
                           inventory_filter_val="",
                           title=get_title("主机信息列表"),
                           system_list=systems,
                           pagination=paginate,
                           os_filter_val="",
                           os_list_val=get_os_list(),
                           sys_was_count_list=sys_was_count_list,
                           sys_db2_count_list=sys_db2_count_list)
Example #36
def fetcher():
    for subgroup in d.available_subgroups:
        try:
            cursor.execute("SELECT * FROM %s;" %(subgroup['db_name']))
            search_list = cursor.fetchall()
        except Exception as e:
            logging.error(e)

        for item in search_list:
            search_param = ' '.join(item)
            i = 1
            while True:
                feed_url = ('%s?page=rss&term=%s&user=%s&offset=%d' %(nyaa_url, search_param, subgroup['nyaa_id'], i)).replace(' ', '+')
                i = i + 1
                feed = feedparser.parse(feed_url)
                if not feed['entries']:
                    break
                for feed_entry in feed['entries']:
                    feed_title = feed_entry['title']
                    parsed_title = utils.get_title(feed_title, subgroup['regex']['title'])
                    if parsed_title == item[0]:
                        parsed_episode = utils.get_episode(feed_title, subgroup['regex']['episode'])
                        if parsed_episode:
                            cursor.execute("SELECT 1 FROM downloaded WHERE title='%s' AND episode='%s' AND subgroup = '%s';" %(item[0].replace("'", "''"), parsed_episode, subgroup['subgroup']))
                            if not cursor.fetchone():
                                dl_location = cfg.dl_location + item[0]
                                if not os.path.exists(dl_location):
                                    try:
                                        os.mkdir(dl_location)
                                    except Exception as e:
                                        logging.error(e)
                                payload.append({'title': item[0], 'link': feed_entry['link'], 'episode': parsed_episode, 'subgroup': subgroup['subgroup'], 'quality': item[1]})

            
        # Send the payload to the web API
        r = requests.post('http://%s:%s/api/addtorrent' % (API_URL, API_PORT),
                          headers={'content-type': 'application/json'},
                          data=json.dumps(payload))
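
The feed-URL construction shared by this fetcher and Example #18 reduces to the following; the endpoint and parameters here are made up:

nyaa_url = 'https://example.org/rss'  # hypothetical endpoint
feed_url = ('%s?page=rss&term=%s&user=%s&offset=%d'
            % (nyaa_url, 'show name 720p', '1234', 1)).replace(' ', '+')
print(feed_url)
# -> https://example.org/rss?page=rss&term=show+name+720p&user=1234&offset=1
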
Example #37
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FirefoxOptions

from utils import get_title

if __name__ == '__main__':
    options = FirefoxOptions()
    options.headless = True
    options.binary_location = '/usr/local/bin/firefox'

    driver = webdriver.Firefox(executable_path='/usr/local/bin/geckodriver',
                               log_path='/tmp/geckodriver.log',
                               firefox_options=options)

    title = get_title(driver)
    print(title)
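
utils.get_title itself is not shown in these Selenium examples; a plausible, purely hypothetical implementation they would work with (the URL is an assumption):

def get_title(driver, url='https://example.com'):
    driver.get(url)      # load a page first
    return driver.title  # Selenium exposes the current page title directly
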
Example #38
 def test_get_title_exception_first(self, fake_input, fake_print):
     fake_input.side_effect = ['', 'Test title']
     result = utils.get_title()
     self.assertEqual(result, 'Test title')
     self.assertEqual(fake_print.call_count, 1)
Example #39
#   query = "subspace+clustering_N100_H1"
    query = "subgraph+mining"
#   query = "data+cleaning_N100_H1"
#   query = "image+descriptor_N100_H1"

    graph = nx.read_gexf("models/%s.gexf" % query, node_type=int)

#   print "The Dense", len(graph.in_edges(637)), \
#                                           sum([a["weight"] for u,v,a in graph.in_edges(637, data=True)]), \
#                                           np.mean([graph.out_degree(u) for u,v in graph.in_edges(637)])
#
#   print "GSpan", len(graph.in_edges(296)), \
#                                   sum([a["weight"] for u,v,a in graph.in_edges(296, data=True)]), \
#                                   np.mean([graph.out_degree(u) for u,v in graph.in_edges(296)])
#   sys.exit()

    rank = rank_nodes(graph, 1.0, 1.0, 1.0, 1.0, ctx_relev=0.5, query_relev=0.5, age_relev=0.5,
                                                limit=15, out_file="graphs/ranks/%s.gexf" % query)

    print
    for node_id, paper_id, query_score, score, score_layers in rank :
        print "{%15s,  %4d,  %3d,  %.4f} : [%.2f]   %-70s  |  %s" % (paper_id,
                                                                                             graph.node[node_id]["year"],
                                                                                             len(graph.in_edges(node_id)),
                                                                                             100*query_score,
                                                                                             100*score,
                                                                                             utils.get_title(paper_id)[:70],
                                                                                             ' '.join(map(str,np.round(100*score_layers,3))))

Example #40
def get_summary_text():
    if request.method == 'GET':
        url = request.args.get('url')
        return jsonify(string=get_summary(url), title=get_title(url))
    return "Not opening!"
Example #41
def extract_and_write_posts(elements, filename):
    try:
        f = open(filename, "w", newline="\r\n")
        f.writelines(
            " TIME || TYPE  || TITLE || STATUS  ||   LINKS(Shared Posts/Shared Links etc) "
            + "\n" + "\n")

        for x in elements:
            try:
                title = " "
                status = " "
                link = ""
                time = " "

                # time
                time = utils.get_time(x)

                # title
                title = utils.get_title(x, selectors)
                if title.text.find("shared a memory") != -1:
                    x = x.find_element_by_xpath(selectors.get("title_element"))
                    title = utils.get_title(x, selectors)

                status = utils.get_status(x, selectors)
                if (title.text == driver.find_element_by_id(
                        selectors.get("title_text")).text):
                    if status == "":
                        temp = utils.get_div_links(x, "img", selectors)
                        if (
                                temp == ""
                        ):  # no image tag, which means it is not a life event
                            link = utils.get_div_links(
                                x, "a", selectors).get_attribute("href")
                            type = "status update without text"
                        else:
                            type = "life event"
                            link = utils.get_div_links(
                                x, "a", selectors).get_attribute("href")
                            status = utils.get_div_links(x, "a",
                                                         selectors).text
                    else:
                        type = "status update"
                        if utils.get_div_links(x, "a", selectors) != "":
                            link = utils.get_div_links(
                                x, "a", selectors).get_attribute("href")

                elif title.text.find(" shared ") != -1:

                    x1, link = utils.get_title_links(title)
                    type = "shared " + x1

                elif title.text.find(" at ") != -1 or title.text.find(
                        " in ") != -1:
                    if title.text.find(" at ") != -1:
                        x1, link = utils.get_title_links(title)
                        type = "check in"
                    elif title.text.find(" in ") != 1:
                        status = utils.get_div_links(x, "a", selectors).text

                elif (title.text.find(" added ") != -1
                      and title.text.find("photo") != -1):
                    type = "added photo"
                    link = utils.get_div_links(x, "a",
                                               selectors).get_attribute("href")

                elif (title.text.find(" added ") != -1
                      and title.text.find("video") != -1):
                    type = "added video"
                    link = utils.get_div_links(x, "a",
                                               selectors).get_attribute("href")

                else:
                    type = "others"

                if not isinstance(title, str):
                    title = title.text

                status = status.replace("\n", " ")
                title = title.replace("\n", " ")

                line = (str(time) + " || " + str(type) + " || " + str(title) +
                        " || " + str(status) + " || " + str(link) + "\n")

                try:
                    f.writelines(line)
                except Exception:
                    print("Posts: Could not map encoded characters")
            except Exception:
                pass
        f.close()
    except Exception:
        print("Exception (extract_and_write_posts)", "Status =",
              sys.exc_info()[0])

    return
Example #42
def browse_papers(path_, csv_file):
    print("Processing citations ...")
    dict_1, dict_2 = parse_csv_file(csv_file)

    #    client = MongoClient('localhost', 27017)
    client = MongoClient()
    db = client['apsdb']  # Get a databese
    aps = db['aps-articles-basic']  # Get a collection

    print("Removing all record ...")
    aps.delete_many({})  # Clean the collection

    print("Processing files ...")

    tmp_list = []
    for root, dirs, files in os.walk(path_):
        for name in files:
            if name.endswith((".json")):
                jfile = root + "/" + name
                data = json.load(open(jfile))

                year, month, day = get_date_jsonfile(jfile, data)
                journal = get_journal_short_json(jfile, data)
                issue, volume = get_issue_volume(jfile, data)
                doi = get_doi(jfile, data)
                num_pages = get_number_of_pages(jfile, data)
                coauthors = get_coauthors_jsonfile(jfile, data)
                affiliations = get_all_affiliations(jfile, data)
                countries = get_all_countries(jfile, data)
                title = get_title(jfile, data)

                aps_paper = {'year': year, 'month': month, 'day': day}
                aps_paper['journal'] = journal
                aps_paper['issue'] = issue
                aps_paper['volume'] = volume
                aps_paper['doi'] = doi
                aps_paper['num_authors'] = len(coauthors)
                aps_paper['num_affs'] = len(affiliations)
                aps_paper['num_countries'] = len(countries)
                aps_paper['title'] = title
                aps_paper['title_length'] = len(title)

                aps_paper['num_pages'] = num_pages

                if doi in dict_1.keys():
                    aps_paper['citations'] = len(dict_1[doi])
                else:
                    aps_paper['citations'] = 0

                if doi in dict_2.keys():
                    aps_paper['num_references'] = len(dict_2[doi])
                else:
                    aps_paper['num_references'] = 0

                tmp_list.append(aps_paper)
                if len(tmp_list) > BIG_LIST_SIZE:
                    aps.insert_many(tmp_list)
                    tmp_list = []

    if len(tmp_list) > 0:
        aps.insert_many(tmp_list)
        tmp_list = []

    return aps
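
The buffered insert_many pattern above, reduced to pure Python (the batch size and the "insert" are stand-ins):

BIG_LIST_SIZE = 2
batches, tmp_list = [], []
for doc in range(5):
    tmp_list.append(doc)
    if len(tmp_list) > BIG_LIST_SIZE:
        batches.append(tmp_list)  # stands in for aps.insert_many(tmp_list)
        tmp_list = []
if len(tmp_list) > 0:
    batches.append(tmp_list)
print(batches)  # -> [[0, 1, 2], [3, 4]]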