Пример #1
0
    def populate_keywords(self):
        """Insert authority and inquest keywords read from the workbook.

        Rows come from the 'Keywords_Auth' and 'Keywords_Inq' sheets. Each row
        becomes one model instance added to a single session, which is
        committed once after both sheets have been processed.
        """
        logger.info('Populating keywords.')

        session = self._db_client.get_session()

        # Authority keywords.
        for (r_keyword_id, r_category_id, r_name, r_description,
             r_synonyms) in self._read_workbook('Keywords_Auth'):
            authority_keyword = models.AuthorityKeyword(
                authorityKeywordId=r_keyword_id,
                authorityCategoryId=r_category_id,
                name=utils.format_string(r_name),
                description=utils.format_string(
                    r_description, utils.NullBehaviour.NULL_TO_NULL),
                synonyms=utils.format_as_list(
                    r_synonyms, utils.NullBehaviour.NULL_TO_NULL),
            )
            session.add(authority_keyword)

        # Inquest keywords.
        for (r_keyword_id, r_category_id, r_name, r_description,
             r_synonyms) in self._read_workbook('Keywords_Inq'):
            inquest_keyword = models.InquestKeyword(
                inquestKeywordId=r_keyword_id,
                inquestCategoryId=r_category_id,
                name=utils.format_string(r_name),
                description=utils.format_string(
                    r_description, utils.NullBehaviour.NULL_TO_NULL),
                synonyms=utils.format_as_list(
                    r_synonyms, utils.NullBehaviour.NULL_TO_NULL),
            )
            session.add(inquest_keyword)

        session.commit()
Пример #2
0
 def debug_print(self):
     """Print a header, a one-line summary of the player's attributes, and a footer."""
     header, _ = utils.format_string("Player.debug_print()", length=50, my_divider="-")
     print(header)
     template = (
         "x,y: ({},{}); name: {}; kind: {}; direction: {}; max_hit_points: {}; "
         "hit_points: {}; chance_to_hit: {}; experience: {}; profession: {}; gold: {}"
     )
     values = (
         self.x, self.y, self.name, self.kind, self.direction, self.max_hit_points,
         self.hit_points, self.chance_to_hit, self.experience, self.profession, self.gold,
     )
     print(template.format(*values))
     # self.inventory.debug_print()
     footer, _ = utils.format_string("- END - Player.debug_print()", length=50, my_divider="-")
     print(footer)
Пример #3
0
    def result_round(self, list_tournament: list) -> tuple:
        """
        Ask for the points of every match of the current round of a tournament.

        The tournament is picked through ``choice_tournament``. For each match
        of its latest round the two players' new points are read and stored in
        the third slot of each player's entry.

        Returns the tuple (tournament index, round index, round key, updated
        matches). Nothing is returned when the chosen tournament is finished
        or when no tournament matches the chosen index.
        """

        chosen_index = self.choice_tournament(list_tournament)

        for id_tournament, tournament in enumerate(list_tournament):

            if id_tournament != chosen_index:
                continue

            self.menu.sub_menu(f"[ Tournament : {tournament.name} ] ")

            if tournament.state is False:
                print("Tournament finnish")
                continue

            round_count = len(tournament.tours)
            str_round = f"round_{round_count}"
            round_indicator = round_count - 1
            results_round = []

            print(
                f"{'Enter the points of the players of '}{str_round}\n{'-' * 42}"
            )

            current_matches = tournament.tours[round_indicator][str_round]

            for number_match, match in enumerate(current_matches):

                print(
                    f"\n[ Match {number_match + 1} ] Player N°{match[0][0]} Vs "
                    f"N°{match[1][0]}\n{'-' * 29}")

                points_first = float(
                    utils.format_string(f"New point player N°{match[0][0]}"))
                points_second = float(
                    utils.format_string(f"New point player N°{match[1][0]}"))

                # Player entries are tuples: edit mutable copies, then store
                # them back as tuples.
                first_player = list(match[0])
                second_player = list(match[1])
                first_player[2] = points_first
                second_player[2] = points_second
                match[0] = tuple(first_player)
                match[1] = tuple(second_player)

                results_round.append(match)

            return id_tournament, round_indicator, str_round, results_round
Пример #4
0
 def getPages(self,topic_url):
     """Download every page of a topic and write its content to a text file.

     The first page is fetched here: its URL, title, opening post, page count
     and replies are written to ``<aim_dir>/<topic_id>.txt``. Pages 2..N are
     delegated to ``getPageOther``, which appends to the same file.

     Any exception is caught: the traceback is printed, the error file
     ``self.err_url`` is overwritten with the traceback followed by the
     failing URL, and the error is logged.

     :param topic_url: topic path, appended to ``self.root_url``.
     """
     root_url = self.root_url
     aim_dir = self.aim_dir
     enter = self.enter
     response = requests.get(root_url + topic_url)
     # NOTE(review): the detected charset is not used in this method; the call
     # is kept in case auto_detect has side effects -- confirm and remove.
     charset = utils.auto_detect(root_url + topic_url)
     soup = BeautifulSoup(response.text,"html.parser")
     try:
         # Information found on the first page.
         title = soup.select(".maxtitle")[0].text
         topic = soup.select(".conttxt .w740")[0].text  # the original poster's topic
         pages = soup.select(".pages")[1]['maxindex']  # number of pages
         # The topic id is used as the file name.
         pattern = re.compile(r"thread-c-537-(.+?)-1.html")
         topic_id = re.search(pattern, topic_url).group(1)
         aim_file = aim_dir+"/"+topic_id+".txt"
         # The context manager closes the file on every path, and before
         # getPageOther re-opens the same file in append mode.
         with open(aim_file,'w') as out:
             out.write("url:"+response.url+enter)
             out.write("title:"+utils.format_string(title.strip(),'utf-8')+enter)
             out.write("topic:"+utils.format_string(topic.strip(),'utf-8')+enter)
             out.write("pages:"+utils.format_string(pages.strip(),'utf-8')+enter)
             # Replies; the first ".w740" element is the opening post, skip it.
             for reply in soup.select(".w740")[1:]:
                 comment = self.get_comment(reply)
                 logging.debug( "page 1:"+comment )
                 out.write("page 1:"+utils.format_string(comment,'utf-8'))
                 out.write(enter)
                 out.flush()
         # Replies from the second page onwards.
         if int(pages)>1:
             for page in xrange(2,int(pages)+1):
                 url=response.url.replace("1.html",str(page)+".html")
                 self.getPageOther(page,url,aim_file)
     except Exception:
         traceback.print_exc()
         # Overwrite the error file with the traceback, then the failing URL.
         with open(self.err_url,'w+') as err_file:
             traceback.print_exc(file=err_file)
             err_file.write("err_url: %s" % response.url+enter)
         logging.error("err_url: %s" % response.url )
Пример #5
0
    def webhook_logic(self):
        """
            Method to string together all components of the webhook logic
            (i.e. message, carousel, and quick reply handling).
        """

        # build postback flow
        # each postback includes a payload check, a state update,
        # possible data insertion, and a message sending action
        postback_container = []

        for name, data in self.carousels:
            for option in data["options"]:
                payload = option["name"].upper()
                target = option["target"]

                # messages that follow postbacks will always be indexed at 0
                target_content = "%s_0" % option["target"] if \
                    self.bot_configuration[option["target"]]["type"] == "message_list" \
                    else option["target"]

                if "storage" in option:
                    data_insertion = \
"""
state_coll.update({"user_id": sender_id}, {
    "$set": {
        "data.%s": message_payload
    }
}, upsert=False)
"""
                    storage = "_".join(option["storage"].split("."))
                    data_insertion = data_insertion % option["storage"]

                else:
                    data_insertion = ""

                logic = format_string(tl.postback_logic,
                                      payload=option["name"].upper(),
                                      target=option["target"],
                                      data_insertion=data_insertion,
                                      target_content=target_content)

                postback_container.append(logic)

        web_logic = \
            format_string(
                tl.webhook_logic, state_map_template=self.state_creation(),
                postback_control_flow="\n".join(postback_container))

        return web_logic
Пример #6
0
 def getPageOther(self,x,page_url,aim_file):
     """Append the replies of one follow-up page of a topic to ``aim_file``.

     Any exception is caught: the traceback is printed, the error file
     ``self.err_url`` is overwritten with the traceback followed by the
     failing URL, and the error is logged.

     :param x: page number, used only to label the written lines.
     :param page_url: URL of the page to fetch.
     :param aim_file: path of the text file the replies are appended to.
     """
     enter = self.enter
     # Stays None when the request itself fails, so the error handler can
     # fall back to page_url instead of raising NameError.
     response = None
     try:
         # The context manager guarantees the output file is closed.
         with open(aim_file,'a') as out:
             logging.debug( "页数>2的url:"+page_url +" , aim_file:"+aim_file )
             response = requests.get(page_url)
             soup = BeautifulSoup(response.text,"html.parser")
             for reply in soup.select(".w740"):
                 comment = self.get_comment(reply)
                 logging.debug( "page "+str(x)+":"+comment )
                 out.write("page "+str(x)+":"+utils.format_string(comment,'utf-8'))
                 out.write(enter)
                 out.flush()
     except Exception:
         failed_url = page_url if response is None else response.url
         traceback.print_exc()
         # Overwrite the error file with the traceback, then the failing URL.
         with open(self.err_url,'w+') as err_file:
             traceback.print_exc(file=err_file)
             err_file.write("err_url:"+failed_url+enter)
             err_file.write(enter)
         logging.error("err_url: %s" % failed_url )
Пример #7
0
 def debug_print(self):
     """Print a header, the NPC's position, identity and gold, its inventory, and a divider."""
     header, width = utils.format_string("Npc.debug_print()", "-")
     print(header)
     template = "x,y: ({},{}); character_name: {}; character_kind: {}; gold: {}"
     summary = template.format(
         self.x, self.y, self.character_name, self.character_kind, self.gold)
     print(summary)
     self.inventory.debug_print()
     # Closing divider uses the length returned by format_string.
     divider = "-" * width
     print(divider)
Пример #8
0
    def populate_sources(self):
        """Insert one ``models.Source`` per row of the 'Sources' sheet.

        All rows are added to a single session that is committed once at the
        end.
        """
        logger.info('Populating sources.')

        session = self._db_client.get_session()

        for (r_source_id, r_jurisdiction_id, r_name, r_code,
             r_rank) in self._read_workbook('Sources'):
            source = models.Source(
                sourceId=r_source_id,
                jurisdictionId=utils.format_string(
                    r_jurisdiction_id, utils.NullBehaviour.NULL_TO_NULL),
                name=utils.format_string(r_name),
                code=utils.format_string(r_code),
                rank=utils.string_to_int(r_rank),
            )
            session.add(source)

        session.commit()
Пример #9
0
    def populate_deceased(self):
        """Insert one ``models.Deceased`` per row of the 'Deceased' sheet.

        The inquest reason id has every 'MANDATORY_' occurrence removed when
        it starts with that prefix. All rows are added to a single session
        that is committed once at the end.
        """
        logger.info('Populating deceased.')

        session = self._db_client.get_session()

        for row in self._read_workbook('Deceased'):
            (r_inquest_id, r_last_name, r_given_name, r_age, r_age_unit, r_sex,
             r_death_location, r_death_date, r_death_cause, r_death_cause_id,
             r_death_manner_id, r_inquest_reason_id) = row

            # Validate inquest type.
            # TODO: remove.
            # An inquest is either 'Discretionary' or 'Mandatory-<reason>'; this makes 'Mandatory' redundant.
            inquest_reason_id = (
                r_inquest_reason_id.replace('MANDATORY_', '')
                if r_inquest_reason_id.startswith('MANDATORY_')
                else r_inquest_reason_id
            )

            # TODO: validate that deceased age is < 18.
            last_name = utils.format_as_title(
                r_last_name, utils.NullBehaviour.NULL_TO_NULL)
            given_names = utils.format_string(
                r_given_name, utils.NullBehaviour.NULL_TO_NULL)

            deceased = models.Deceased(
                inquestId=r_inquest_id,
                inquestReasonId=inquest_reason_id,
                # TODO: remove format_as_id call.
                deathMannerId=utils.format_as_id(r_death_manner_id),
                deathCauseId=r_death_cause_id,
                deathCause=utils.format_string(r_death_cause),
                deathDate=utils.string_to_date(r_death_date),
                deathLocation=utils.format_string(
                    r_death_location, utils.NullBehaviour.NULL_TO_NULL),
                lastName=last_name,
                givenNames=given_names,
                age=utils.string_to_int(r_age),
                ageUnit=utils.format_string(r_age_unit),
                sex=utils.format_string(r_sex),
            )
            session.add(deceased)

        session.commit()
Пример #10
0
    def populate_death_causes(self):
        """Insert one ``models.DeathCause`` per row of the 'Causes' sheet.

        All rows are added to a single session that is committed once at the
        end.
        """
        logger.info('Populating causes of death.')

        session = self._db_client.get_session()

        for (r_cause_id, r_name, r_description,
             r_synonyms) in self._read_workbook('Causes'):
            death_cause = models.DeathCause(
                deathCauseId=r_cause_id,
                name=utils.format_string(r_name),
                description=utils.format_string(
                    r_description, utils.NullBehaviour.NULL_TO_NULL),
                synonyms=utils.format_as_list(
                    r_synonyms, utils.NullBehaviour.NULL_TO_NULL),
            )
            session.add(death_cause)

        session.commit()
Пример #11
0
    def main_tournament_info(self) -> tuple:
        """
        Collect the details of a new tournament.

        Reads the name, location and description, then the start and end
        dates (year, month and day each), the index of a time control from
        ``ViewTournament.TIMES_CONTROL`` and the number of rounds.

        Returns (name, location, start date, end date, time control,
        description, number of rounds); dates are "dd/mm/YYYY" strings.
        """

        self.menu.sub_menu("* Add A Tournament *")

        name = utils.format_string("Name tournament").capitalize()
        location = utils.format_string("Location").capitalize()
        description = utils.format_string("Description").capitalize()

        date_headers = (
            f"\nEnter start date\n{'-' * 16}",
            f"\nEnter end date\n{'-' * 14}",
        )

        # One formatted date per header: start first, then end.
        formatted_dates = []
        for header in date_headers:
            print(header)
            year = int(utils.format_string("Year"))
            month = int(utils.format_string("Month"))
            day = int(utils.format_string("Day"))
            chosen_day = datetime.date(year, month, day)
            formatted_dates.append(chosen_day.strftime("%d/%m/%Y"))

        start_date, end_date = formatted_dates

        print(f"\nSelect time control\n{'-' * 19}")

        for key, label in enumerate(ViewTournament.TIMES_CONTROL):
            print(f"[ {key} ]  {label}")

        choice_time_control = int(utils.format_string("Choice time control"))
        time_control = ViewTournament.TIMES_CONTROL[choice_time_control]

        nb_rounds = int(utils.format_string("Number of rounds"))

        return (
            name,
            location,
            start_date,
            end_date,
            time_control,
            description,
            nb_rounds,
        )
Пример #12
0
    def logic_creation(self):
        """
            Render the application logic and write it to ``app.py``.

            ``tl.base_application_logic`` is filled with the Mongo host, user
            id, page access token, verify token and the rendered webhook
            logic; the result is written to ``<output_dir>/app.py``.

            Always returns True.
        """

        template = tl.base_application_logic
        application_logic = format_string(
            template,
            mongo_host=self.mongo_host,
            user_id=self.user_id,
            page_access_token=self.pat,
            verify_token=self.vt,
            webhook_logic=self.webhook_logic(),
        )

        # write content to file
        destination = "%s/app.py" % self.output_dir
        with open(destination, "w") as app_file:
            app_file.write(application_logic)

        return True
Пример #13
0
    def add_player_tournament(self, table_players) -> list:
        """
        Ask for the indexes of the eight players to add to a tournament.

        The available players are displayed first; the eight answers are
        converted to integers and returned in the order they were entered.
        """

        self.view_player.view_show_player(table_players)

        print(
            f"\n{'Select players by their index to add them to the tournament'}\n"
            f"{'-' * 59}")

        return [
            int(utils.format_string(f"Player {position}"))
            for position in range(1, 9)
        ]
Пример #14
0
    def state_creation(self):
        """
            Build the initial state map and write it to ``state.py``.

            Every entry of ``self.bot_configuration`` whose type is
            "message_list" gets a record with the keys "switch", "index",
            "length", "list" and "target". Three further keys are then set:
            "flow_instantiated", "current_type" and "data".

            Returns the state map dictionary.
        """
        state_map = {
            node: {
                "switch": False,
                "index": 0,
                "length": len(node_data["messages"]),
                "list": node_data["messages"],
                "target": node_data["target"]
            }
            for node, node_data in self.bot_configuration.iteritems()
            if node_data["type"] == "message_list"
        }

        state_map["flow_instantiated"] = False
        state_map["current_type"] = ""
        state_map["data"] = {}

        template = "\nstate_map = ~state_map_content~\n"

        # write state map to file
        destination = "%s/state.py" % self.output_dir
        with open(destination, "w") as state_file:
            rendered = format_string(template, state_map_content=state_map)
            state_file.write(rendered)

        return state_map
Пример #15
0
 def debug_print(self):
     """Print a header, the debug output of every NPC in ``self.npcs``, and a divider."""
     header, width = utils.format_string("Npcs.debug.print()", "-")
     print(header)
     for npc in self.npcs:
         npc.debug_print()
     # Closing divider uses the length returned by format_string.
     divider = "-" * width
     print(divider)
Пример #16
0
    def populate_references(self):
        """Insert authorities, inquests, their keywords and authority relationships.

        Processing order:

        1. 'Authorities' sheet: one ``models.Authority`` per row plus one
           ``models.AuthorityKeywords`` per keyword id.
        2. 'Inquests' sheet: one ``models.Inquest`` per row plus one
           ``models.InquestKeywords`` per keyword id (one id is skipped).
        3. Authority relationships (citations, related authorities, related
           inquests), added only after every authority and inquest row has
           been added.

        The formatted name of each authority and inquest is also recorded in
        ``self._reference_to_name`` under the key (reference type, integer id).
        Inquest relationships are collected but not stored (see the TODO near
        the end). Everything is committed once at the end.
        """
        session = self._db_client.get_session()

        authority_id_to_related = {
        }  # From ID to tuple of citations, related authorities, and related inquests.
        inquest_id_to_related = {
        }  # From ID to tuple of related authorities and related inquests.

        logger.info('Populating authorities.')

        for row in self._read_workbook('Authorities'):
            (r_authority_id, r_name, r_overview, r_synopsis, r_quotes, r_notes,
             r_primary, r_primary_text, r_judicial_review, r_keywords, r_tags,
             r_citations, r_see_authorities, r_see_inquests) = row

            session.add(
                models.Authority(authorityId=r_authority_id,
                                 isPrimary=r_primary,
                                 primaryField=utils.format_string(
                                     r_primary_text,
                                     utils.NullBehaviour.NULL_TO_NULL),
                                 isJudicialReview=r_judicial_review,
                                 name=utils.format_string(r_name),
                                 overview=utils.format_string(
                                     r_overview,
                                     utils.NullBehaviour.NULL_TO_STRING),
                                 synopsis=utils.format_string(
                                     r_synopsis,
                                     utils.NullBehaviour.NULL_TO_STRING),
                                 quotes=utils.format_string(
                                     r_quotes,
                                     utils.NullBehaviour.NULL_TO_NULL),
                                 notes=utils.format_string(
                                     r_notes,
                                     utils.NullBehaviour.NULL_TO_NULL),
                                 tags=utils.format_as_list(
                                     r_tags,
                                     utils.NullBehaviour.NULL_TO_NULL)))
            # Presumably flushed so the authority row exists before the keyword
            # rows that reference it are added -- confirm.
            session.flush()

            # TODO: behaviour should be NOT NULL.
            for keyword_id in utils.string_to_list(
                    r_keywords, utils.NullBehaviour.NULL_TO_NULL):
                session.add(
                    models.AuthorityKeywords(
                        authorityId=r_authority_id,
                        authorityKeywordId=keyword_id,
                    ))

            # Relationships are deferred to the loop further down, after all
            # authorities and inquests have been added.
            authority_id_to_related[r_authority_id] = (r_citations,
                                                       r_see_authorities,
                                                       r_see_inquests)
            self._reference_to_name[(
                self._REFERENCE_TYPE_AUTHORITY,
                int(r_authority_id))] = utils.format_string(r_name)

        logger.info('Populating inquests.')

        for row in self._read_workbook('Inquests'):
            (r_inquest_id, r_name, r_overview, r_synopsis, r_notes,
             r_presiding_officer, r_location, r_start, r_end, r_sitting_days,
             r_exhibits, r_recommendations, r_jurisdiction_id, r_primary,
             r_primary_text, r_keywords, r_tags, r_see_authorities,
             r_see_inquests) = row

            # NOTE(review): r_primary_text is unpacked but not used for inquests.
            session.add(
                models.Inquest(inquestId=r_inquest_id,
                               jurisdictionId=r_jurisdiction_id,
                               location=utils.format_string(
                                   r_location,
                                   utils.NullBehaviour.NULL_TO_NULL),
                               isPrimary=r_primary,
                               name=utils.format_string(r_name),
                               overview=utils.format_string(
                                   r_overview,
                                   utils.NullBehaviour.NULL_TO_NULL),
                               synopsis=utils.format_string(
                                   r_synopsis,
                                   utils.NullBehaviour.NULL_TO_STRING),
                               notes=utils.format_string(
                                   r_notes, utils.NullBehaviour.NULL_TO_NULL),
                               presidingOfficer=utils.format_string(
                                   r_presiding_officer,
                                   utils.NullBehaviour.NULL_TO_STRING),
                               start=utils.string_to_date(r_start),
                               end=utils.string_to_date(r_end),
                               sittingDays=utils.string_to_int(
                                   r_sitting_days,
                                   utils.NullBehaviour.NULL_TO_NULL),
                               exhibits=utils.string_to_int(
                                   r_exhibits,
                                   utils.NullBehaviour.NULL_TO_NULL),
                               recommendations=utils.string_to_int(
                                   r_recommendations,
                                   utils.NullBehaviour.NULL_TO_NULL),
                               tags=utils.format_as_list(
                                   r_tags, utils.NullBehaviour.NULL_TO_NULL)))
            # Presumably flushed so the inquest row exists before the keyword
            # rows that reference it are added -- confirm.
            session.flush()

            # Collected for later use; nothing reads this mapping yet (see the
            # TODO about inquest relationships below).
            inquest_id_to_related[r_inquest_id] = (r_see_authorities,
                                                   r_see_inquests)
            self._reference_to_name[(
                self._REFERENCE_TYPE_INQUEST,
                int(r_inquest_id))] = utils.format_string(r_name)

            # TODO: use behaviour NOT_NULL.
            for keyword_id in utils.string_to_list(
                    r_keywords, utils.NullBehaviour.NULL_TO_NULL):
                # TODO: remove this if statement.
                if keyword_id != 'INQUEST_FEDERAL_PROVINCIAL':
                    session.add(
                        models.InquestKeywords(
                            inquestId=r_inquest_id,
                            inquestKeywordId=keyword_id,
                        ))

        logger.info('Populating authority relationships.')

        # Every authority and inquest has been added by now, so rows that
        # reference other authorities or inquests can be added.
        for (r_authority_id,
             (r_citations, r_see_authorities,
              r_see_inquests)) in authority_id_to_related.items():
            for citation in utils.string_to_list(
                    r_citations, utils.NullBehaviour.NULL_TO_NULL):
                session.add(
                    models.AuthorityCitations(
                        authorityId=r_authority_id,
                        citedAuthorityId=citation,
                    ))
            for related_authority in utils.string_to_list(
                    r_see_authorities, utils.NullBehaviour.NULL_TO_NULL):
                session.add(
                    models.AuthorityRelated(
                        authorityId=r_authority_id,
                        relatedAuthorityId=related_authority,
                    ))
            for related_inquest in utils.string_to_list(
                    r_see_inquests, utils.NullBehaviour.NULL_TO_NULL):
                session.add(
                    models.AuthorityInquests(
                        authorityId=r_authority_id,
                        inquestId=related_inquest,
                    ))

        logger.info('Populating inquest relationships.')

        # TODO: add tables for inquest relationships.

        session.commit()
Пример #17
0
def new_call(update, context):
    """Handle a "new call" command sent to the bot.

    Chats whose id equals the sender's id, and chats that are not registered
    in the database, are answered with an explanatory message and the
    conversation ends (``ConversationHandler.END``).

    Otherwise the message text is parsed with ``utils.format_string`` into
    the arguments TITLE, DATE, TIME, DURATION, DESCRIPTION, AGENDA LINK and a
    list of missing argument indexes. If the title, date or time is missing,
    the user is prompted for the first missing one and the matching state
    (``ADD_TITLE``, ``ADD_DATE`` or ``ADD_TIME``) is returned. When none of
    them is missing the call is saved via ``save_call_info`` and ``None`` is
    returned.

    Updates the module-level ``global_user_id``, ``global_missing_arguments``
    and ``saving``.
    """
    global global_user_id
    global global_missing_arguments
    global saving

    message = update.message
    groupchat = message.chat
    user = message.from_user
    global_user_id = user.id
    print("Got chat id")

    if groupchat.id == user.id:
        print("Chat is user")
        groupchat.send_message(text=new_call_onlygroups_message)
        return ConversationHandler.END

    if database.find_row_by_id(item_id=groupchat.id)[0] == -1:
        print("Chat is not registered yet")
        text = chat_not_registerred.format(new_group_description)
        groupchat.send_message(text=text, parse_mode=ParseMode.HTML)
        return ConversationHandler.END

    # The trailing space guarantees find(' ') succeeds for a bare command.
    message_text = message.text + ' '
    print("Message Text: " + message_text)
    command = message_text[:message_text.find(' ') + 1]
    print(command)
    # ALGORITHM IS NOT WORKING - AND IS SLOW
    arguments, missing_arguments = utils.format_string(message_text, command)[:2]
    print(missing_arguments)

    # Reset Global Values
    # list.clear() returns None, so its result must not be assigned back;
    # start from a fresh empty list instead.
    global_missing_arguments = []
    saving = ["", "", "", arguments[3], "", ""]

    # ARGUMENTS FORMAT: TITLE, DATE, TIME, DURATION, DESCRIPTION, AGENDA LINK
    if 0 in missing_arguments or 1 in missing_arguments or 2 in missing_arguments:

        global_missing_arguments = missing_arguments.copy()

        print("GET ARGUMENTS")
        if 0 in global_missing_arguments:
            print("Requesting Title input")
            # SEND MESSAGE
            format_input_argument(update, 0, "Title",
                                  global_missing_arguments,
                                  global_missing_arguments.index(0))
            return ADD_TITLE
        elif 1 in global_missing_arguments:
            print("Title is not missing - Requesting Date input")
            # SEND MESSAGE
            format_input_argument(update, 0, "Date",
                                  global_missing_arguments,
                                  global_missing_arguments.index(1))
            return ADD_DATE
        elif 2 in global_missing_arguments:
            print("Date is not missing - Requesting Time input")
            # SEND MESSAGE
            format_input_argument(update, 0, "Time",
                                  global_missing_arguments,
                                  global_missing_arguments.index(2))
            return ADD_TIME

    print(
        "Not returned get arguments -> ALL necessary arguments are alraedy given"
    )
    # SAVE CALL TO DATABASE
    # Empty description / agenda link are stored as "N/A". Rebinding a loop
    # variable would not change the sequence, so a new list is built.
    arguments = list(arguments[:4]) + [
        "N/A" if argument == "" else argument for argument in arguments[4:]
    ]

    save_call_info(update=update,
                   context=context,
                   title=arguments[0],
                   date=arguments[1],
                   time=arguments[2],
                   duration=arguments[3],
                   description=arguments[4],
                   agenda_link=arguments[5])
Пример #18
0
    def content_creation(self):
        """
            Render the bot content and write it to ``content.py``.

            Each carousel is rendered from ``tl.carousel_content_base`` with
            one element (title, image, postback button) per option. Each
            message of each message list is rendered from
            ``tl.message_content_base`` under the name ``<list name>_<index>``.
            A default "greeting" message is added when no message list is
            named "greeting". Everything is combined through
            ``tl.content_base`` and written to ``<output_dir>/content.py``.

            Always returns True.
        """
        # temporary standard image url
        image_url = "http://messengerdemo.parseapp.com/img/rift.png"

        carousel_container = []

        for name, data in self.carousels:
            # Fresh list per carousel, so a carousel only contains elements
            # built from its own options.
            car_elems = []

            for option in data["options"]:
                # Title: underscores become spaces, the first letter of each
                # word is upper-cased and the rest is left untouched.
                words = option["name"].replace("_", " ").split(" ")
                title = " ".join(word[:1].upper() + word[1:] for word in words)

                car_elems.append({
                    "title":
                    title,
                    "image_url":
                    image_url,
                    "buttons": [{
                        "type":
                        "postback",
                        "title":
                        title,
                        "payload":
                        option["name"].replace(" ", "_").upper()
                    }]
                })

            carousel_container.append(
                format_string(tl.carousel_content_base,
                              name=name,
                              carousel_elements=car_elems))

        message_list_container = []

        for name, data in self.message_lists:
            # we keep message variable names as <title.index>
            for idx, msg in enumerate(data["messages"]):
                message_list_container.append(
                    format_string(tl.message_content_base,
                                  name="%s_%s" % (name, idx),
                                  message_text=msg["message"]))

        # create a default first-time greeting message
        if "greeting" not in (entry[0] for entry in self.message_lists):
            message_list_container.append(
                format_string(tl.message_content_base,
                              name="greeting",
                              message_text="Hello, nice to meet you!"))

        content = format_string(tl.content_base,
                                carousels=",\n".join(carousel_container),
                                messages=",\n".join(message_list_container))

        # write content to file
        with open("%s/content.py" % self.output_dir, "w") as file:
            file.write(content)

        return True
Пример #19
0
    def populate_documents(self):
        """Insert documents, their sources and their links from the 'Documents' sheet.

        For each row:

        * a ``models.DocumentSource`` is created the first time its id is
          seen; the id 'NO PUBLISH' is treated as already known, so no source
          row is created for it;
        * the link is the value returned by ``_upload_document_if_exists``
          for the 'INQUESTS_CA' source (which may be None), no link for
          'NO PUBLISH', and the row's own link otherwise;
        * an authority or inquest document is added depending on the row's
          reference type, followed by a link row when a link is available.

        Everything is committed once at the end.
        """
        logger.info('Populating documents.')

        session = self._db_client.get_session()

        # A set literal is required here: set('NO PUBLISH') would build a set
        # of the individual characters instead of the one string.
        document_sources = {'NO PUBLISH'}

        for row in self._read_workbook('Documents'):
            (r_serial, r_reference_type, r_name, r_citation, r_date, r_link,
             r_document_source, r_source_id, r_reference_id, r_primary,
             r_document_type) = row

            document_source_id = utils.format_as_id(r_document_source)

            # Create document source type if it does not exist.
            # TODO: create export table?
            if document_source_id not in document_sources:
                session.add(
                    models.DocumentSource(
                        documentSourceId=document_source_id,
                        name=utils.format_string(r_document_source),
                    ))
                session.flush()
                document_sources.add(document_source_id)

            # Upload document to S3 if respective file exists locally.
            link = None
            if document_source_id == 'INQUESTS_CA':
                link = self._upload_document_if_exists(
                    r_reference_type, r_name, r_date, r_source_id, r_serial,
                    int(r_reference_id))
            elif document_source_id != 'NO PUBLISH':
                link = r_link

            if r_reference_type == self._REFERENCE_TYPE_AUTHORITY:
                authority_document = models.AuthorityDocument(
                    authorityId=r_reference_id,
                    authorityDocumentTypeId=utils.format_as_id(
                        r_document_type, utils.NullBehaviour.NULL_TO_NULL),
                    sourceId=r_source_id,
                    isPrimary=r_primary,
                    name=utils.format_string(r_name),
                    citation=utils.format_string(
                        r_citation, utils.NullBehaviour.NULL_TO_NULL),
                    created=utils.string_to_date(
                        r_date, utils.NullBehaviour.NULL_TO_NULL),
                )
                session.add(authority_document)
                # Flushed before authorityDocumentId is read below; presumably
                # this is what populates the generated id -- confirm.
                session.flush()
                if link is not None:
                    session.add(
                        models.AuthorityDocumentLinks(
                            authorityDocumentId=authority_document.
                            authorityDocumentId,
                            documentSourceId=document_source_id,
                            link=link,
                        ))
            elif r_reference_type == self._REFERENCE_TYPE_INQUEST:
                inquest_document = models.InquestDocument(
                    inquestId=r_reference_id,
                    inquestDocumentTypeId=utils.format_as_id(
                        r_document_type, utils.NullBehaviour.NULL_TO_NULL),
                    name=utils.format_string(r_name),
                    created=utils.string_to_date(r_date),
                )
                session.add(inquest_document)
                # Flushed before inquestDocumentId is read below; presumably
                # this is what populates the generated id -- confirm.
                session.flush()
                if link is not None:
                    session.add(
                        models.InquestDocumentLinks(
                            inquestDocumentId=inquest_document.
                            inquestDocumentId,
                            documentSourceId=document_source_id,
                            link=link,
                        ))

        session.commit()
def getPages(topic_url,aim_dir):
    """Fetch a forum topic and dump every page of it to a text file.

    The first page is downloaded from ``root_url + topic_url`` and parsed
    here; pages 2..N are delegated to ``getPageOther``, which appends to the
    same file.  The output file is ``<aim_dir>/<topic_id>.txt`` where the
    topic id is taken from the ``thread-<id>-1.html`` part of ``topic_url``.

    Args:
        topic_url: Topic path relative to ``root_url``.
        aim_dir: Directory the output file is written to.

    Any exception raised while parsing or writing is printed, logged and
    recorded in the ``err_url`` file instead of being propagated.  Errors
    from the initial download itself are not caught.
    """

    response = requests.get(root_url + topic_url)
    # NOTE(review): the result of auto_detect was never used; the call is kept
    # in case the helper has side effects -- confirm and remove if it has none.
    utils.auto_detect(root_url + topic_url)
    soup = BeautifulSoup(response.text,"html.parser")
    try:
        # Thread-level information.
        title = soup.select(".maxtitle")[0].text
        topic = soup.select(".conttxt .w740")[0].text  # the original poster's message
        tag = soup.select(".pages")[1]
        pages = tag['maxindex']  # total number of pages
        date_list = soup.select('span[xname]')  # tags that carry an xname attribute
        userName_list = soup.select('a[xname="uname"]')  # tags whose xname equals "uname"

        # Build the ordered list of user ids and the id -> display name map.
        home_pattern = re.compile(r"http://i.autohome.com.cn/(.+?)/home.html")
        userId_list = []
        user_dict = {}
        for user_tag in userName_list:
            user_id = home_pattern.search(user_tag['href']).group(1)
            userId_list.append(user_id)
            user_dict[user_id] = user_tag.text

        fatieNum_list = soup.select('a[href$="/bbs.html"]')  # number of topics posted
        huitieNum_list = soup.select('a[href$="/bbs/reply.html"]')  # number of replies posted
        jinghuaNum_list = soup.select('a[href$="/bbs/wonderful_1.html"]')
        registerDate_list = soup.find_all("li",text=re.compile(u"注册:"))

        # Only some users have a "featured posts" link, so key the counts by
        # user id rather than by position.
        jinghua_pattern = re.compile(r"http://i.autohome.com.cn/(.+?)/bbs/wonderful_1.html")
        jinghua_dict = {}
        for jinghua_tag in jinghuaNum_list:
            jinghua_id = jinghua_pattern.search(jinghua_tag['href']).group(1)
            jinghua_dict[jinghua_id] = jinghua_tag.text

        # The topic id is used as the output file name.
        user_id = userId_list[0]
        topic_id = re.search(r"thread-(.+?)-1.html", topic_url).group(1)
        aim_file = aim_dir+"/"+topic_id+".txt"

        # The context manager closes the output file on success and on error.
        with open(aim_file,'w') as out:
            out.write("url:"+response.url+enter)
            out.write("title:"+utils.format_string(title.strip(),'utf-8')+enter)
            out.write("topic:"+utils.format_string(topic.strip(),'utf-8')+enter)
            out.write("topic_date:"+date_list[0].text+enter)
            out.write("user_id:"+user_id+enter)
            out.write("user_name:"+user_dict[user_id]+enter)
            if user_id in jinghua_dict:
                out.write(" 精华帖:"+jinghua_dict[user_id] +enter)
            else:
                out.write(" 精华帖:0帖"+enter)
            out.write(" 发帖数:"+fatieNum_list[0].text+enter)
            out.write(" 回帖数:"+huitieNum_list[0].text+enter)
            out.write(" 注册时间:"+registerDate_list[0].string.split(":")[1]+enter)
            # Location / following / car are siblings of the registration-date node.
            for x in registerDate_list[0].find_next_siblings("li"):
                if u"来自:" in x.text:
                    out.write("来自:"+x.text.split(":")[1]+enter)
                if u"关注:" in x.text:
                    out.write("关注:"+x.text.split(":")[1]+enter)
                if u"爱车:" in x.text:
                    out.write("爱车:"+x.text.split(":")[1]+enter)
            out.write("pages:"+utils.format_string(pages.strip(),'utf-8')+enter)

            # Replies on the first page; element 0 is the topic itself, skip it.
            replys = soup.select(".w740")
            n = 1
            for reply in replys[1:]:
                comment = get_comment(reply)
                user_id = userId_list[n]
                logging.debug(multiprocessing.current_process().name + " page 1:"+comment+" date:"+date_list[n].text )
                out.write("page 1:%s" % utils.format_string(comment,'utf-8')+separate)
                out.write(" date:%s" % date_list[n].text+separate)
                out.write(" user_id:"+user_id+separate)
                out.write(" user_name:"+user_dict[user_id]+separate)
                if user_id in jinghua_dict:
                    out.write(" 精华帖:"+jinghua_dict[user_id]+separate)
                else:
                    out.write(" 精华帖:0帖"+separate)
                out.write(" 发帖数:"+fatieNum_list[n].text+separate)
                out.write(" 回帖数:"+huitieNum_list[n].text+separate)
                out.write(" 注册时间:"+registerDate_list[n].string.split(":")[1]+separate)
                # Use this reply's own registration node (index n); index 0
                # would repeat the topic author's profile for every reply.
                for x in registerDate_list[n].find_next_siblings("li"):
                    if u"来自:" in x.text:
                        out.write("来自:"+x.text.split(":")[1]+separate)
                    if u"关注:" in x.text:
                        out.write("关注:"+x.text.split(":")[1]+separate)
                    if u"爱车:" in x.text:
                        out.write("爱车:"+x.text.split(":")[1]+separate)

                out.write(enter)
                out.flush()
                n = n+1

        # Remaining pages are appended by getPageOther; the file above is
        # already closed, so everything from page 1 is on disk first.
        page_count = int(pages)
        if page_count > 1:
            for page in xrange(2,page_count+1):
                url = response.url.replace("1.html",str(page)+".html")
                getPageOther(page,url,aim_file)
    except Exception:
        traceback.print_exc()
        logging.error("err_url: %s" % topic_url )
        # NOTE(review): as before, the error file is truncated on every
        # failure, so it only ever holds the most recent traceback.
        with open(err_url,'w') as err_file:
            traceback.print_exc(file=err_file)
            err_file.write("err_url: %s" % topic_url+enter)
def getPageOther(page_num,page_url,aim_file):
    """Fetch one follow-up page of a topic and append its replies to a file.

    Args:
        page_num: Page number, used only to label each output line.
        page_url: Absolute URL of the page to download.
        aim_file: Path of the output file; it is opened in append mode.

    Any exception (including download errors) is printed, logged and recorded
    in the ``err_url`` file instead of being propagated.
    """
    try:
        # The context manager closes the output file on success and on error.
        with open(aim_file,'a') as out:
            response = requests.get(page_url)
            soup = BeautifulSoup(response.text,"html.parser")
            replys = soup.select(".w740")
            date_list = soup.select('span[xname]')
            userName_list = soup.select('a[xname="uname"]')  # tags whose xname equals "uname"

            # Build the ordered list of user ids and the id -> display name map.
            home_pattern = re.compile(r"http://i.autohome.com.cn/(.+?)/home.html")
            userId_list = []
            user_dict = {}
            for user_tag in userName_list:
                poster_id = home_pattern.search(user_tag['href']).group(1)
                userId_list.append(poster_id)
                user_dict[poster_id] = user_tag.text

            fatieNum_list = soup.select('a[href$="/bbs.html"]')  # number of topics posted
            huitieNum_list = soup.select('a[href$="/bbs/reply.html"]')  # number of replies posted
            jinghuaNum_list = soup.select('a[href$="/bbs/wonderful_1.html"]')
            registerDate_list = soup.find_all("li",text=re.compile(u"注册:"))

            # Featured-post counts keyed by user id; previously this dict was
            # left empty, so every reply was reported with a count of zero.
            jinghua_pattern = re.compile(r"http://i.autohome.com.cn/(.+?)/bbs/wonderful_1.html")
            jinghua_dict = {}
            for jinghua_tag in jinghuaNum_list:
                jinghua_id = jinghua_pattern.search(jinghua_tag['href']).group(1)
                jinghua_dict[jinghua_id] = jinghua_tag.text

            n = 0
            for reply in replys:
                comment = get_comment(reply)
                # Author of this reply; without this lookup every line would
                # carry the last user id collected above.
                user_id = userId_list[n]
                logging.debug( multiprocessing.current_process().name+" page "+str(page_num)+":"+comment )
                out.write("page "+str(page_num)+":"+utils.format_string(comment,'utf-8')+separate)
                out.write(" date:%s" % date_list[n].text+separate)
                out.write(" user_id:"+user_id+separate)
                out.write(" user_name:"+user_dict[user_id]+separate)
                if user_id in jinghua_dict:
                    out.write(" 精华帖:"+jinghua_dict[user_id]+separate)
                else:
                    out.write(" 精华帖:0帖"+separate)
                out.write(" 发帖数:"+fatieNum_list[n].text+separate)
                out.write(" 回帖数:"+huitieNum_list[n].text+separate)
                out.write(" 注册时间:"+registerDate_list[n].string.split(":")[1]+separate)
                # Location / following / car are siblings of the registration-date node.
                for x in registerDate_list[n].find_next_siblings("li"):
                    if u"来自:" in x.text:
                        out.write("来自:"+x.text.split(":")[1]+separate)
                    if u"关注:" in x.text:
                        out.write("关注:"+x.text.split(":")[1]+separate)
                    if u"爱车:" in x.text:
                        out.write("爱车:"+x.text.split(":")[1]+separate)
                out.write(enter)
                out.flush()
                n = n+1

    except Exception:
        traceback.print_exc()
        logging.error("err_url: %s" % page_url )
        # NOTE(review): as before, the error file is truncated on every
        # failure, so it only ever holds the most recent traceback.
        with open(err_url,'w') as err_file:
            traceback.print_exc(file=err_file)
            err_file.write("err_url:"+page_url+enter)
            err_file.write(enter)
Пример #22
0
 pre_label = label2class[
     prediction_labels[i]]  # get prediction label's name
 if pre_label != 'shift:' + global_null:
     total_uas += 1
     # if not a shift operation, means this operation denote a dependency
     arclabel = pre_label.split(':')
     if arclabel[0] == 'left-arc':
         # stack[-2] depends on stack[-1]
         cld = slists[i][1]  # stack[-2]
         prt = slists[i][0]  # stack[-1]
         if cld is not None and prt is not None:
             # append this to cur_sentence
             cur_sentence[int(
                 cld[0]
             )] = '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(
                 cld[0], format_string(cld[1]), cld[4], cld[6],
                 format_string(cld[7]), prt[0],
                 format_string(arclabel[1]),
                 'Matched' if cld[6] == prt[0] else 'Not',
                 'Matched' if cld[7] == arclabel[1] else 'Not')
             if cld[6] == prt[0]:
                 sum_uas += 1
                 if cld[7] == arclabel[1]:
                     sum_las += 1
     else:
         # stack[-1] depends on stack[-2] (root dependency included at here)
         cld = slists[i][0]  # stack[-1]
         prt = slists[i][1]  # stack[-2]
         if prt is None:
             prt = ['0', '0']  # parent is Root default
             # append this to cur_sentence