Example #1
def parse_answer(answer):
    # Fetching answer
    resp = get_page(ANSWER_URL.format(answer.question, answer.writer_uname))
    doc = BeautifulSoup(resp, 'html.parser', parse_only=ANSWER_STRAIN)

    # Get Credible Users who have upvoted this answer
    users = doc.find('div', class_=CREDIBILITY_CLASS).find_all('a',
                                                               class_='user')
    for user in users:
        Profile.create_or_get(uname=user['href'].split('/')[2],
                              name=user.string)

    # Update answer stats
    answer.views = int(
        doc.find('div', class_=VIEW_ROW_CLASS).strong.string.replace(',', ''))
    answer.upvotes = int(
        doc.find('a', class_=UPVOTE_ROW_CLASS).strong.string.replace(',', ''))
    answer.last_parsed = datetime.datetime.now()
    answer.save()

    # Saving the HTML code of the profile
    # Storing Answers is not feasible.
    #filename = str(answer.id) + '.html'
    #with open(os.path.join(ANSWERS_FOLDER, filename), 'w+') as fstream:
    #  fstream.write(resp)

    sys.stdout.write('\rDone Parsing Answer id %d (%d)' %
                     (answer.id, len(users)))
    sys.stdout.flush()

def parse_answer(answer):
  # Fetching answer
  resp = get_page(ANSWER_URL.format(answer.question, answer.writer_uname))
  doc = BeautifulSoup(resp, 'html.parser', parse_only=ANSWER_STRAIN)

  # Get Credible Users who have upvoted this answer
  users = doc.find('div', class_=CREDIBILITY_CLASS).find_all('a', class_='user')
  for user in users:
    Profile.create_or_get(uname=user['href'].split('/')[2], name=user.string)

  # Update answer stats
  answer.views = int(doc.find('div', class_=VIEW_ROW_CLASS).strong.string
                     .replace(',', ''))
  answer.upvotes = int(doc.find('a', class_=UPVOTE_ROW_CLASS).strong.string
                       .replace(',', ''))
  answer.last_parsed = datetime.datetime.now()
  answer.save()

  # Saving the HTML code of the profile
  filename = str(answer.id) + '.html'
  with open(os.path.join(ANSWERS_FOLDER, filename), 'w+') as fstream:
    fstream.write(resp)

  sys.stdout.write('\rDone Parsing Answer id %d (%d)' % (answer.id, len(users)))
  sys.stdout.flush()
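
For context, parse_answer() relies on several module-level names the excerpt never defines: the URL template, the SoupStrainer passed as parse_only, the CSS-class constants, and the get_page() helper. The following is a minimal sketch of that setup, assuming requests for HTTP; every concrete value here (URL layout, class names, folder name) is an illustrative guess, not taken from the original project.

# Hedged sketch of the setup parse_answer() assumes.
# Only the names appear in the excerpt above; every value is a guess.
import requests
from bs4 import SoupStrainer

ANSWER_URL = 'https://www.quora.com/{0}/answer/{1}'       # assumed question-slug/uname layout
ANSWER_STRAIN = SoupStrainer('div', class_='AnswerPage')  # parse only the answer subtree
CREDIBILITY_CLASS = 'CredibleUsers'                       # assumed class of the upvoter box
VIEW_ROW_CLASS = 'ViewsRow'                               # assumed class of the view-count row
UPVOTE_ROW_CLASS = 'UpvoteRow'                            # assumed class of the upvote row
ANSWERS_FOLDER = 'answers'

def get_page(url):
  # Assumed behavior: fetch the URL and return the raw HTML as text.
  resp = requests.get(url)
  resp.raise_for_status()
  return resp.text

Passing a SoupStrainer through parse_only stops BeautifulSoup from building a tree for the whole page, which saves noticeable time and memory when a crawler parses thousands of answers.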
Example #3
    parser.add_argument('--no_profile',
                        action='store_true',
                        help='Do not Crawl Profiles')
    parser.add_argument('--no_answer',
                        action='store_true',
                        help='Do not Crawl Answers')
    args = parser.parse_args()

    # Filling Database with Top Writers 2016
    with open('top_writers_2016.json', 'r') as fstream:
        writer_list = json.load(fstream)
    with open('other_writers.json', 'r') as fstream:
        writer_list += json.load(fstream)
    create_directory(ANSWERS_FOLDER)
    create_directory(PROFILE_FOLDER)
    for writer in writer_list:
        new = Profile.create_or_get(uname=writer['uname'],
                                    name=writer['name'])[1]
        if new: print(u'New Profile %s Created' % writer['uname'])
    #print "Number of Writers Added = ", len(writer_list)

    # Starting to Crawl
    total_parsing = 0
    max_crawl = args.max_crawl
    while total_parsing < max_crawl:
        if not args.no_profile:
            # Parse Old Profiles
            old_time = datetime.datetime.now() - datetime.timedelta(days=7)
            old_profiles = Profile.select().where(
                Profile.last_parsed <= old_time).limit(max_crawl -
                                                       total_parsing)
            total_parsing += len(old_profiles)
            print "Number of Profiles to Crawl - ", len(old_profiles)

  parser.add_argument('--max_crawl', type=int,  # type assumed from numeric use
                      help='Number of maximum requests to make')
  parser.add_argument('--no_profile', action='store_true',
                      help='Do not Crawl Profiles')
  parser.add_argument('--no_answer', action='store_true',
                      help='Do not Crawl Answers')
  args = parser.parse_args()

  # Filling Database with Top Writers 2016
  with open('top_writers_2016.json', 'r') as fstream:
    writer_list = json.load(fstream)
  with open('other_writers.json', 'r') as fstream:
    writer_list += json.load(fstream)
  create_directory(ANSWERS_FOLDER)
  create_directory(PROFILE_FOLDER)
  for writer in writer_list:
    new = Profile.create_or_get(uname=writer['uname'], name=writer['name'])[1]
    if new: print(u'New Profile %s Created' % writer['uname'])
  #print "Number of Writers Added = ", len(writer_list)

  # Starting to Crawl
  total_parsing = 0
  max_crawl = args.max_crawl
  while total_parsing < max_crawl:
    if not args.no_profile:
      # Parse Old Profiles
      old_time = datetime.datetime.now() - datetime.timedelta(days=7)
      old_profiles = Profile.select().where(
        Profile.last_parsed <= old_time).limit(max_crawl - total_parsing)
      total_parsing += len(old_profiles)
      print "Number of Profiles to Crawl - ", len(old_profiles)
      for profile in old_profiles:
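
Both drivers also assume a peewee-style Profile model (create_or_get(), a peewee 2.x API, returns an (instance, created) tuple) plus a create_directory() helper. Below is a minimal sketch under those assumptions; the SQLite backend and the exact field list are illustrative guesses.

# Hedged sketch of the persistence helpers the crawler loop assumes.
# peewee 2.x provides Model.create_or_get(); everything concrete below is a guess.
import datetime
import os

from peewee import CharField, DateTimeField, Model, SqliteDatabase

db = SqliteDatabase('crawler.db')  # assumed backend

class Profile(Model):
  uname = CharField(unique=True)  # profile slug taken from the link href
  name = CharField()              # display name
  last_parsed = DateTimeField(default=datetime.datetime.min)  # "never parsed"

  class Meta:
    database = db

def create_directory(path):
  # Create the folder if it does not already exist.
  if not os.path.exists(path):
    os.makedirs(path)

Defaulting last_parsed to datetime.min makes a freshly inserted writer immediately satisfy the "older than seven days" filter, so the crawl loop picks it up on its first pass.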