Example #1
def first_pass(elastic: Elastic) -> None:
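    # Backfill pass: add the "forum" and "size" fields to any mbox document
    # that is missing either one.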
    hits = scan(
        client=elastic.es,
        index=elastic.db_mbox,
        # Match documents missing either field; generated from the elasticsearch_dsl
        # expression (~Q("exists", field="forum")) | (~Q("exists", field="size")).
        query={
            "query": {
                "bool": {
                    "should": [
                        {
                            "bool": {
                                "must_not": [{"exists": {"field": "forum"}}]
                            }
                        },
                        {
                            "bool": {
                                "must_not": [{"exists": {"field": "size"}}]
                            }
                        },
                    ]
                }
            }
        },
    )
    for hit in hits:
        pid = hit["_id"]
        ojson = hit["_source"]
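        # Derive the forum name from the list id, e.g. "<dev.foo.example.org>" -> "dev@foo.example.org".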
        ojson["forum"] = ojson.get("list", "").strip("<>").replace(".", "@", 1)
        source = elastic.es.get(
            elastic.db_source, ojson["dbid"], _source="source"
        )["_source"]["source"]
        ojson["size"] = len(source)
        elastic.index(index=elastic.db_mbox, id=pid, body=ojson)
Example #2
def third_pass(elastic: Elastic) -> None:
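    # Threading pass: resolve the "thread" and "previous" fields by walking up
    # each message's parent chain until a message with a known thread id is found.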
    hits = scan(client=elastic.es, index=elastic.db_mbox, query={})
    for hit in hits:
        pid = hit["_id"]
        ojson = hit["_source"]
        if ojson["thread"] != "":
            continue
        if ojson["top"] is True:
            ojson["previous"] = archiver.get_previous_mid(elastic, ojson)
            ojson["thread"] = pid
            elastic.index(index=elastic.db_mbox, id=pid, body=ojson)
        else:
            tree = []
            while ojson["thread"] == "":
                tree.append(ojson)
                ojson_parent = archiver.get_parent_info(elastic, ojson)
                if ojson_parent is None:
                    ojson["previous"] = None
                    print("Error:", ojson["mid"], "has no parent")
                    break
                ojson["previous"] = ojson_parent["mid"]
                ojson = ojson_parent
            for info in tree:
                info["thread"] = ojson["thread"]
                elastic.index(index=elastic.db_mbox, id=info["mid"], body=info)
Example #3
def second_pass(elastic: Elastic) -> None:
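    # Seeding pass (oldest messages first): mark thread tops and initialise the
    # "previous" and "thread" fields for the later threading pass.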
    hits = scan(
        client=elastic.es,
        index=elastic.db_mbox,
        query={"sort": {"epoch": "asc"}},
    )
    for hit in hits:
        pid = hit["_id"]
        ojson = hit["_source"]
        parent_info = archiver.get_parent_info(elastic, ojson)
        ojson["top"] = parent_info is None
        ojson["previous"] = ""
        ojson["thread"] = pid if (parent_info is None) else ""
        elastic.index(index=elastic.db_mbox, id=pid, body=ojson)
Example #4
def first_pass(elastic: Elastic) -> None:
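    # Combined pass: backfill "forum" and "size" and seed the threading fields
    # ("top", "previous", "thread") in a single scan over the index.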
    hits = scan(
        client=elastic.es,
        index=elastic.db_mbox,
        query={"sort": {
            "epoch": "asc"
        }},
    )
    for hit in hits:
        pid = hit["_id"]
        ojson = hit["_source"]
        parent_info = archiver.get_parent_info(elastic, ojson)
        ojson["top"] = parent_info is None
        ojson["forum"] = ojson.get("list", "").strip("<>").replace(".", "@", 1)
        source = elastic.es.get(elastic.db_source,
                                ojson["dbid"],
                                _source="source")["_source"]["source"]
        ojson["size"] = len(source)
        ojson["previous"] = ""
        ojson["thread"] = pid if (parent_info is None) else ""
        elastic.index(index=elastic.db_mbox, id=pid, body=ojson)
Example #5
#  ** INITIAL VERSION, liable to change **

import argparse
import sys
import yaml
from plugins.elastic import Elastic

# Needs 3.4 or higher to work
if sys.version_info < (3, 4):
    print("This script requires Python 3.4 or higher in order to work!")
    sys.exit(-1)

# the desired mappings
with open("mappings.yaml", "r") as stream:
    mapping_file = yaml.safe_load(stream)

elastic = Elastic()
major = elastic.engineMajor()
if major != 7:
    print("This script requires ElasticSearch 7 API in order to work!")
    sys.exit(-1)

parser = argparse.ArgumentParser(description="Command line options.")
parser.add_argument(
    "--create",
    dest="create",
    action="store_true",
    help="Create the missing mapping(s)",
)
parser.add_argument(
    "--shards",
    dest="shards",
Example #6
    def archive_message(self,
                        mlist,
                        msg,
                        raw_message=None,
                        dry=False,
                        dump=None,
                        defaultepoch=None,
                        digest=False):
        """Send the message to the archiver.

        :param mlist: The IMailingList object.
        :param msg: The message object.
        :param raw_message: Raw message bytes
        :param dry: Whether or not to actually run
        :param dump: Optional path for dump on fail

        :return (lid, mid)
        """

        lid = textlib.normalize_lid(mlist.list_id, strict=True)
        if lid is None:
            raise ValueError(f"Invalid list id {lid}")

        private = False
        if hasattr(mlist, "archive_public") and mlist.archive_public is True:
            private = False
        elif hasattr(mlist,
                     "archive_public") and mlist.archive_public is False:
            private = True
        elif (hasattr(mlist, "archive_policy")
              and mlist.archive_policy is not ArchivePolicy.public):
            private = True

        if raw_message is None:
            raw_message = msg.as_bytes()
        ojson, contents, msg_metadata, irt, skipit = self.compute_updates(
            lid, private, msg, raw_message, defaultepoch)
        if not ojson:
            _id = msg.get("message-id") or msg.get("Subject") or msg.get(
                "Date")
            raise Exception("Could not parse message %s for %s" % (_id, lid))
        if skipit:
            print(
                "Skipping archiving of email due to invalid date and default date set to skip"
            )
            return lid, "(skipped)"
        if digest:
            return lid, ojson["mid"]
        if dry:
            print("**** Dry run, not saving message to database *****")
            return lid, ojson["mid"]

        if dump:
            try:
                elastic = Elastic()
            except elasticsearch.exceptions.ElasticsearchException as e:
                print(e)
                print(
                    "ES connection failed, but dumponfail specified, dumping to %s"
                    % dump)
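                # "elastic" stays unset if the connection failed; the index calls
                # below will then raise and the dump fallback in the except handler
                # takes over.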
        else:
            elastic = Elastic()

        if config.get("archiver", "threadinfo"):
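            # Optionally enrich the document with thread properties before indexing.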
            try:
                timeout = int(config.get("archiver", "threadtimeout") or 5)
                timeout = str(timeout) + "s"
                limit = int(config.get("archiver", "threadparents") or 10)
                ojson = add_thread_properties(elastic, ojson, timeout, limit)
            except Exception as err:
                print("Could not add thread info", err)
                if logger:
                    logger.info("Could not add thread info %s" % (err, ))
            else:
                print("Added thread info successfully", ojson["mid"])
                if logger:
                    logger.info("Added thread info successfully %s" %
                                (ojson["mid"], ))

        try:
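            # Index the attachments, the mbox document itself, and its raw source.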
            if contents:
                for key in contents:
                    elastic.index(
                        index=elastic.db_attachment,
                        id=key,
                        body={"source": contents[key]},
                    )

            elastic.index(
                index=elastic.db_mbox,
                id=ojson["mid"],
                body=ojson,
            )

            elastic.index(
                index=elastic.db_source,
                id=ojson["dbid"],
                body={
                    "message-id": msg_metadata["message-id"],
                    "source": mbox_source(raw_message),
                },
            )
            # Write to audit log
            try:
                auditlog_exists = elastic.indices.exists(
                    index=elastic.db_auditlog)
            except elasticsearch.exceptions.AuthorizationException:
                auditlog_exists = False
            if auditlog_exists:
                elastic.index(
                    index=elastic.db_auditlog,
                    body={
                        "date": time.strftime(
                            "%Y/%m/%d %H:%M:%S", time.gmtime(time.time())
                        ),
                        "action": "index",
                        "remote": "internal",
                        "author": "archiver.py",
                        "target": ojson["mid"],
                        "lid": lid,
                        "log": f"Indexed email {ojson['message-id']} for {lid} as {ojson['mid']}",
                    },
                )

        # If we have a dump dir and ES failed, push to dump dir instead as a JSON object
        # We'll leave it to another process to pick up the slack.
        except Exception as err:
            print(err)
            if dump:
                print(
                    "Pushing to ES failed, but dumponfail specified, dumping JSON docs"
                )
                uid = uuid.uuid4()
                mbox_path = os.path.join(dump, "%s.json" % uid)
                with open(mbox_path, "w") as f:
                    json.dump(
                        {
                            "id": ojson["mid"],
                            "mbox": ojson,
                            "mbox_source": {
                                "id": ojson["dbid"],
                                "permalink": ojson["mid"],
                                "message-id": msg_metadata["message-id"],
                                "source": mbox_source(raw_message),
                            },
                            "attachments": contents,
                        },
                        f,
                        indent=2,
                    )
                # We're exiting here; the rest can't be done without ES.
                sys.exit(0)
            # otherwise fail as before
            raise err

        if logger:
            logger.info("Pony Mail archived message %s successfully",
                        ojson["mid"])
        oldrefs = []

        # Is this a direct reply to a pony mail email?
        if irt != "":
            dm = re.search(r"pony-([a-f0-9]+)-([a-f0-9]+)@", irt)
            if dm:
                cid = dm.group(1)
                mid = dm.group(2)
                if elastic.exists(index=elastic.db_account, id=cid):
                    doc = elastic.get(index=elastic.db_account, id=cid)
                    if doc:
                        oldrefs.append(cid)
                        # N.B. no document id is supplied, so ES will generate one
                        elastic.index(
                            index=elastic.db_notification,
                            body={
                                "type": "direct",
                                "recipient": cid,
                                "list": lid,
                                "private": private,
                                "date": ojson["date"],
                                "from": msg_metadata["from"],
                                "to": msg_metadata["to"],
                                "subject": msg_metadata["subject"],
                                "message-id": msg_metadata["message-id"],
                                "in-reply-to": irt,
                                "epoch": ojson["epoch"],
                                "mid": mid,
                                "seen": 0,
                            },
                        )
                        if logger:
                            logger.info("Notification sent to %s for %s", cid,
                                        mid)

        # Are there indirect replies to pony emails?
        if msg_metadata.get("references"):
            for im in re.finditer(r"pony-([a-f0-9]+)-([a-f0-9]+)@",
                                  msg_metadata.get("references")):
                cid = im.group(1)
                mid = im.group(2)
                # TODO: Fix this to work with pibbles
                if elastic.exists(index=elastic.db_mbox, id=cid):
                    doc = elastic.get(index=elastic.db_mbox, id=cid)

                    # does the user want to be notified of indirect replies?
                    if (doc and "preferences" in doc["_source"] and
                            doc["_source"]["preferences"].get("notifications")
                            == "indirect" and cid not in oldrefs):
                        oldrefs.append(cid)
                        # N.B. no document id is supplied, so ES will generate one
                        elastic.index(
                            index=elastic.db_notification,
                            body={
                                "type": "indirect",
                                "recipient": cid,
                                "list": lid,
                                "private": private,
                                "date": ojson["date"],
                                "from": msg_metadata["from"],
                                "to": msg_metadata["to"],
                                "subject": msg_metadata["subject"],
                                "message-id": msg_metadata["message-id"],
                                "in-reply-to": irt,
                                "epoch": ojson["epoch"],
                                "mid": mid,
                                "seen": 0,
                            },
                        )
                        if logger:
                            logger.info("Notification sent to %s for %s", cid,
                                        mid)
        return lid, ojson["mid"]
Example #7
# elasticsearch logs lots of warnings on retries/connection failure
logging.getLogger("elasticsearch").setLevel(logging.ERROR)

verbose_logger = None
if args.verbose:
    verbose_logger = logging.getLogger("verbose")
    verbose_logger.setLevel(logging.INFO)
    # The default handler is set to WARN level
    verbose_logger.addHandler(logging.StreamHandler(sys.stdout))
    archiver.logger = verbose_logger

if args.dry:
    print("Dry-run; continuing to check input data")
else:
    # Fetch config and set up ES
    es = Elastic()
    # We need the index name for bulk actions
    dbname = es.getdbname()

    # No point continuing if the index does not exist
    print("Checking that the database index %s exists ... " % dbname)

    # Need to check the index before starting bulk operations
    try:
        if not es.indices.exists(index=es.db_mbox):
            print("Error: the index '%s' does not exist!" % (es.db_mbox))
            sys.exit(1)
        print("Database exists OK")
    except Exception as err:
        print("Error: unable to check if the index %s exists!: %s" %
              (es.db_mbox, err))
Example #8
    verbose_logger = logging.getLogger("verbose")
    verbose_logger.setLevel(logging.INFO)
    # The default handler is set to WARN level
    verbose_logger.addHandler(logging.StreamHandler(sys.stdout))
    archiver.logger = verbose_logger

if args.dry:
    print("Dry-run; continuing to check input data")
    if args.dump:
        print("Writing mbox output to %s" % args.dump[0])
        dumpfile = open(args.dump[0], 'w')
        dumpfile.write("[\n")
else:
    # Fetch config and set up ES
    es = Elastic(
        logger_level=args.logger_level[0] if args.logger_level else None,
        trace_level=args.trace_level[0] if args.trace_level else None)

    # No point continuing if the index does not exist
    print("Checking that the database index %s exists ... " % es.db_mbox)

    # Need to check the index before starting bulk operations
    try:
        if not es.indices.exists(index=es.db_mbox):
            print("Error: the index '%s' does not exist!" % (es.db_mbox))
            sys.exit(1)
        print("Database exists OK")
    except Exception as err:
        print("Error: unable to check if the index %s exists!: %s" %
              (es.db_mbox, err))
        sys.exit(1)
Example #9
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Utility for retrying docs that we failed to index earlier.
"""

import argparse
import json
import os

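# Support running both as a standalone script and as part of the package.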
if not __package__:
    from plugins.elastic import Elastic
else:
    from .plugins.elastic import Elastic

elastic = Elastic()

parser = argparse.ArgumentParser(description="Command line options.")
parser.add_argument(
    "--source",
    dest="dumpdir",
    help="Path to the directory containing the JSON documents that failed to index",
)

args = parser.parse_args()

dumpDir = args.dumpdir if args.dumpdir else "."

print("Looking for *.json files in %s" % dumpDir)

files = [
Example #10
def main() -> None:
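    # Run the three migration passes over the mbox index in order.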
    elastic: Elastic = Elastic()
    first_pass(elastic)
    second_pass(elastic)
    third_pass(elastic)